diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.2-conda/devcontainer.json rename to .devcontainer/cuda12.5-conda/devcontainer.json index 05bf9173d25..fadce01d060 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json similarity index 87% rename from .devcontainer/cuda12.2-pip/devcontainer.json rename to .devcontainer/cuda12.5-pip/devcontainer.json index 74420214726..026eb540952 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -3,15 +3,15 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", 
"mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -28,7 +28,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c5679cc5141..2e5959338b0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -108,6 +108,28 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: dask_cudf + wheel-build-cudf-polars: + needs: wheel-publish-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
+ matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cudf_polars.sh + wheel-publish-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cudf_polars trigger-pandas-tests: if: inputs.build_type == 'nightly' needs: wheel-build-cudf diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index a8643923a4d..5a937b2f362 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -19,7 +19,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a35802f2ab0..d5dfc9e1ff5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,7 +25,8 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf - - test-cudf-polars + - wheel-build-cudf-polars + - wheel-tests-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -133,9 +134,18 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh - test-cudf-polars: + wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/build_wheel_cudf_polars.sh" + wheel-tests-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -143,7 +153,7 @@ jobs: build_type: pull-request # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars - script: "ci/test_cudf_polars.sh" + script: "ci/test_wheel_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit @@ -167,7 +177,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; @@ -186,7 +196,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fbc28fa6e1..f9cdde7c2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,7 +104,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/README.md b/README.md index 17d2df9a936..1ab6a2d7457 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.2 + cudf=24.08 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh new file mode 100755 index 00000000000..9c945e11c00 --- /dev/null +++ b/ci/build_wheel_cudf_polars.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir="python/cudf_polars" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh new file mode 100755 index 00000000000..c10612a065a --- /dev/null +++ b/ci/run_cudf_polars_pytests.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level. 
+ +# Support invoking run_cudf_polars_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ + +python -m pytest --cache-clear "$@" tests diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh similarity index 67% rename from ci/test_cudf_polars.sh rename to ci/test_wheel_cudf_polars.sh index 95fb4b431bf..900acd5d473 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,19 +18,14 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - -rapids-logger "Install cudf wheel" -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/cudf*.whl)[test] +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install ./local-cudf-dep/cudf*.whl rapids-logger "Install cudf_polars" -python -m pip install 'polars>=1.0' -python -m pip install --no-deps python/cudf_polars +python -m pip install $(echo ./dist/cudf_polars*.whl)[test] rapids-logger "Run cudf_polars tests" @@ -42,13 +37,11 @@ EXITCODE=0 trap set_exitcode ERR set +e -python -m pytest \ - --cache-clear \ +./ci/run_cudf_polars_pytests.sh \ --cov cudf_polars \ --cov-fail-under=100 \ - --cov-config=python/cudf_polars/pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \ - python/cudf_polars/tests + --cov-config=./pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" trap ERR set -e diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 2b20b9d9ce4..c3800d3cc25 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE=" # Download the cudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +python -m pip install ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc9238ab80a..b8d73a01f96 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cubinlinker - cuda-nvtx=11.8 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 96% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index 9fecd452248..3f5fae49cbb 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - 
cuda-cudart-dev - cuda-nvcc @@ -23,7 +23,7 @@ dependencies: - cuda-nvtx-dev - cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 @@ -96,4 +96,4 @@ dependencies: - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 3cdc2050631..9137f099ad1 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -64,6 +64,7 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.7.0 - dlpack >=0.8,<1.0 + # TODO: Change to `2.0` for NumPy 2 - numpy 1.23 - pyarrow ==16.1.0.* - libcudf ={{ version }} @@ -82,7 +83,8 @@ requirements: - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - numba >=0.57 - - {{ pin_compatible('numpy', max_pin='x') }} + # TODO: Update `numpy` in `host` when dropping `<2.0a0` + - numpy >=1.23,<2.0a0 - {{ pin_compatible('pyarrow', max_pin='x.x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c01178bf732..4f99411e978 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" libarrow_version: - "==16.1.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2811711d58c..65347bd6689 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -216,6 +216,8 @@ include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) +# find thread_pool +include(cmake/thirdparty/get_thread_pool.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) @@ -409,6 +411,7 @@ add_library( src/io/orc/stripe_init.cu src/datetime/timezone.cpp src/io/orc/writer_impl.cu + src/io/parquet/arrow_schema_writer.cpp src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu @@ -425,6 +428,7 @@ add_library( src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu + src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu @@ -802,7 +806,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm + PUBLIC ${ARROW_LIBRARIES} 
CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) @@ -925,6 +929,11 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) add_library( ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp ) + if(CUDF_USE_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions( + ${_tgt} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM CUDF_USE_PER_THREAD_DEFAULT_STREAM + ) + endif() set_target_properties( ${_tgt} diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a5b248135c1..ff431c7f260 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -231,8 +231,8 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp - groupby/group_struct_keys.cpp + GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index eeb0149fb3a..985166f7298 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,5 +71,5 @@ void BM_gather(benchmark::State& state) ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ ->UseManualTime(); -GBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -GBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +GBM_BENCHMARK_DEFINE(double_coalesced, double, true); +GBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu index a521dc82739..c27480b69f4 100644 --- a/cpp/benchmarks/copying/scatter.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,5 +74,5 @@ void BM_scatter(benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {1, 8}}) \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -SBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp index a367036c494..ac09c3400cb 100644 --- a/cpp/benchmarks/decimal/convert_floating.cpp +++ b/cpp/benchmarks/decimal/convert_floating.cpp @@ -32,8 +32,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list || std::is_same_v; - static constexpr bool is_32bit = - std::is_same_v || std::is_same_v; static constexpr bool is_128bit = std::is_same_v || std::is_same_v; @@ -69,21 +67,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list decimal conversion algorithm is limited - static constexpr bool is_64bit = !is_32bit && !is_128bit; - if (is_32bit && (exp_mode != 3)) { - state.skip("Decimal32 conversion only works up to scale factors of 10^9."); - return; - } - if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) { - state.skip("Decimal64 conversion only works up to scale factors of 10^18."); - return; - } - if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) { - state.skip("Decimal128 conversion only works up to scale factors of 10^38."); - return; - } - // Type IDs auto const input_id = cudf::type_to_id(); auto const output_id = cudf::type_to_id(); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 01ca23ebbf8..f41285008c4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -48,20 +48,25 @@ void groupby_max_helper(nvbench::state& state, cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); }(); + auto const num_aggregations = state.get_int64("num_aggregations"); + auto keys_view = keys->view(); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - requests[0].values = vals->view(); - requests[0].aggregations.push_back(cudf::make_max_aggregation()); + for (int64_t i = 0; i < num_aggregations; i++) { + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[i].values = vals->view(); + requests[i].aggregations.push_back(cudf::make_max_aggregation()); + } auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_element_count( + static_cast(num_rows * num_aggregations) / elapsed_time / 1'000'000., "Mrows/s"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } @@ -91,7 +96,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, .set_name("groupby_max") .add_int64_axis("cardinality", {0}) .add_int64_power_of_two_axis("num_rows", {12, 18, 24}) - .add_float64_axis("null_probability", {0, 0.1, 0.9}); + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32}); 
NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp new file mode 100644 index 00000000000..bf1a1a5fcf7 --- /dev/null +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +template +void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + auto const num_threads = state.get_int64("num_threads"); + auto const num_aggregations = state.get_int64("num_aggregations"); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const vals = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + auto keys_view = keys->view(); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + BS::thread_pool threads(num_threads); + + std::vector> requests(num_threads); + for (auto& thread_requests : requests) { + for (int64_t j = 0; j < num_aggregations; j++) { + thread_requests.emplace_back(); + thread_requests.back().values = vals->view(); + thread_requests.back().aggregations.push_back( + cudf::make_max_aggregation()); + } + } + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); }; + timer.start(); + threads.detach_sequence(decltype(num_threads){0}, num_threads, perform_agg); + threads.wait(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + timer.stop(); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count( + static_cast(num_rows * num_threads * num_aggregations) / elapsed_time / 1'000'000., + "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), 
"peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_max_multithreaded") + .add_int64_axis("cardinality", {0}) + .add_int64_power_of_two_axis("num_rows", {12, 18}) + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1}) + .add_int64_axis("num_threads", {1, 2, 4, 8}); diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index aa0ee39a179..e91bf06fdfa 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -24,8 +24,8 @@ #include #include #include -#include +#include #include #include @@ -90,7 +90,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -112,13 +112,11 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -170,7 +168,7 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -203,13 +201,11 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..9e76ebb71ab 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -23,10 +23,10 @@ #include #include #include -#include #include +#include #include #include @@ -93,7 +93,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); 
std::vector source_info_vector; @@ -114,13 +114,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -176,7 +174,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -207,13 +205,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index dbc3234dabf..570decf410f 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -143,5 +143,5 @@ void BM_lists_scatter(::benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-1B rows, 64-2048 elements */ \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true); -SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d05c195d077..d5b74da6773 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {32768, 131072, 262144}) + .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 6ec35ddcaf1..fb82b0f5ff3 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -15,10 +15,6 @@ # This function finds cuCollections and performs any additional configuration. 
function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json") if(BUILD_SHARED_LIBS) rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake new file mode 100644 index 00000000000..264257c7199 --- /dev/null +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -0,0 +1,31 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds BS::thread_pool and performs any additional configuration. +function(find_and_configure_thread_pool) + rapids_cpm_find( + BS_thread_pool 4.1.0 + CPM_ARGS + GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git + GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9 + GIT_SHALLOW TRUE + ) + if(NOT TARGET BS_thread_pool) + add_library(BS_thread_pool INTERFACE) + target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include) + target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1") + endif() +endfunction() + +find_and_configure_thread_pool() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index e61102dffac..2f29578f7ae 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/revert_pr_211.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff deleted file mode 100644 index 0f334c0e81f..00000000000 --- a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff +++ /dev/null @@ -1,227 +0,0 @@ -diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh -index 7f9de01..5228193 100644 ---- a/include/cuco/aow_storage.cuh -+++ b/include/cuco/aow_storage.cuh -@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { - * @param size Number of windows to (de)allocate - * @param allocator Allocator used for (de)allocating device storage - */ -- explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; -+ explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}); - - aow_storage(aow_storage&&) = default; ///< Move
constructor - /** -@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base { - * @param key Key to which all keys in `slots` are initialized - * @param stream Stream used for executing the kernel - */ -- void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; -+ void initialize(value_type key, cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. -diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh -index c2c9c14..8ac4236 100644 ---- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh -+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh -@@ -125,7 +125,7 @@ class open_addressing_impl { - KeyEqual const& pred, - ProbingScheme const& probing_scheme, - Allocator const& alloc, -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - : empty_slot_sentinel_{empty_slot_sentinel}, - erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, - predicate_{pred}, -@@ -233,7 +233,7 @@ class open_addressing_impl { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } -+ void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -599,7 +599,7 @@ class open_addressing_impl { - * - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept -+ [[nodiscard]] size_type size(cuda_stream_ref stream) const - { - auto counter = - detail::counter_storage{this->allocator()}; -diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl -index e17a145..3fa1d02 100644 ---- a/include/cuco/detail/static_map/static_map.inl -+++ b/include/cuco/detail/static_map/static_map.inl -@@ -123,7 +123,7 @@ template - void static_map::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -215,7 +215,7 @@ template - template - void static_map:: -- insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept -+ insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) - { - return this->insert_or_assign_async(first, last, stream); - stream.synchronize(); -@@ -465,7 +465,7 @@ template - static_map::size_type - static_map::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl -index 174f9bc..582926b 100644 ---- a/include/cuco/detail/static_multiset/static_multiset.inl -+++ b/include/cuco/detail/static_multiset/static_multiset.inl -@@ -97,7 +97,7 @@ template - void static_multiset::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -183,7 +183,7 @@ template - static_multiset::size_type - static_multiset::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl -index 645013f..d3cece0 100644 ---- a/include/cuco/detail/static_set/static_set.inl -+++ 
b/include/cuco/detail/static_set/static_set.inl -@@ -98,7 +98,7 @@ template - void static_set::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -429,7 +429,7 @@ template - static_set::size_type - static_set::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl -index 3547f4c..94b7f98 100644 ---- a/include/cuco/detail/storage/aow_storage.inl -+++ b/include/cuco/detail/storage/aow_storage.inl -@@ -32,8 +32,8 @@ - namespace cuco { - - template --constexpr aow_storage::aow_storage( -- Extent size, Allocator const& allocator) noexcept -+constexpr aow_storage::aow_storage(Extent size, -+ Allocator const& allocator) - : detail::aow_storage_base{size}, - allocator_{allocator}, - window_deleter_{capacity(), allocator_}, -@@ -64,7 +64,7 @@ aow_storage::ref() const noexcept - - template - void aow_storage::initialize(value_type key, -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - this->initialize_async(key, stream); - stream.synchronize(); -diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh -index c86e90c..95da423 100644 ---- a/include/cuco/static_map.cuh -+++ b/include/cuco/static_map.cuh -@@ -269,7 +269,7 @@ class static_map { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -387,7 +387,7 @@ class static_map { - * @param stream CUDA stream used for insert - */ - template -- void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; -+ void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}); - - /** - * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` -@@ -690,7 +690,7 @@ class static_map { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the hash map can hold. -diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh -index 0daf103..fbcbc9c 100644 ---- a/include/cuco/static_multiset.cuh -+++ b/include/cuco/static_multiset.cuh -@@ -235,7 +235,7 @@ class static_multiset { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -339,7 +339,7 @@ class static_multiset { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the multiset can hold. 
-diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh -index a069939..3517f84 100644 ---- a/include/cuco/static_set.cuh -+++ b/include/cuco/static_set.cuh -@@ -240,7 +240,7 @@ class static_set { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -687,7 +687,7 @@ class static_set { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the hash set can hold. diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json deleted file mode 100644 index ae0a9a4b4f0..00000000000 --- a/cpp/cmake/thirdparty/patches/cuco_override.json +++ /dev/null @@ -1,14 +0,0 @@ - -{ - "packages" : { - "cuco" : { - "patches" : [ - { - "file" : "${current_json_dir}/cuco_noexcept.diff", - "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index bde6ef7d69c..dce81fb1677 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -57,6 +57,7 @@ build_example() { } build_example basic +build_example tpch build_example strings build_example nested_types build_example parquet_io diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 8be17db3781..274a2599189 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -16,6 +16,8 @@ #include "parquet_io.hpp" +#include "../utilities/timer.hpp" + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -140,7 +142,7 @@ int main(int argc, char const** argv) << page_stat_string << ".." << std::endl; // `timer` is automatically started here - Timer timer; + cudf::examples::timer timer; write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats); timer.print_elapsed_millis(); diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/parquet_io.hpp index d2fc359a2fe..e27cbec4fce 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/parquet_io.hpp @@ -124,34 +124,3 @@ std::shared_ptr create_memory_resource(bool is_ return std::nullopt; } - -/** - * @brief Light-weight timer for parquet reader and writer instrumentation - * - * Timer object constructed from std::chrono, instrumenting at microseconds - * precision. Can display elapsed durations at milli and micro second - * scales. Timer starts at object construction. 
- */ -class Timer { - public: - using micros = std::chrono::microseconds; - using millis = std::chrono::milliseconds; - - Timer() { reset(); } - void reset() { start_time = std::chrono::high_resolution_clock::now(); } - auto elapsed() { return (std::chrono::high_resolution_clock::now() - start_time); } - void print_elapsed_micros() - { - std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() - << "us\n\n"; - } - void print_elapsed_millis() - { - std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() - << "ms\n\n"; - } - - private: - using time_point_t = std::chrono::time_point; - time_point_t start_time; -}; diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt new file mode 100644 index 00000000000..1b91d07e148 --- /dev/null +++ b/cpp/examples/tpch/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +include(../set_cuda_architecture.cmake) + +rapids_cuda_init_architectures(tpch_example) +rapids_cuda_set_architectures(RAPIDS) + +project( + tpch_example + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +add_executable(tpch_q1 q1.cpp) +target_link_libraries(tpch_q1 PRIVATE cudf::cudf) +target_compile_features(tpch_q1 PRIVATE cxx_std_17) + +add_executable(tpch_q5 q5.cpp) +target_link_libraries(tpch_q5 PRIVATE cudf::cudf) +target_compile_features(tpch_q5 PRIVATE cxx_std_17) + +add_executable(tpch_q6 q6.cpp) +target_link_libraries(tpch_q6 PRIVATE cudf::cudf) +target_compile_features(tpch_q6 PRIVATE cxx_std_17) + +add_executable(tpch_q9 q9.cpp) +target_link_libraries(tpch_q9 PRIVATE cudf::cudf) +target_compile_features(tpch_q9 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md new file mode 100644 index 00000000000..1ea71ae9824 --- /dev/null +++ b/cpp/examples/tpch/README.md @@ -0,0 +1,38 @@ +# TPC-H Inspired Examples + +Implements TPC-H queries using `libcudf`. We leverage the data generator (a wrapper around the official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) to generate data in Parquet format. + +## Requirements + +- Rust + +## Generating the Dataset + +1. Clone the datafusion repository. +```bash +git clone git@github.com:apache/datafusion.git +``` + +2. Run the data generator. The data will be placed in a `data/` subdirectory. +```bash +cd datafusion/benchmarks/ +./bench.sh data tpch + +# for scale factor 10, +./bench.sh data tpch10 +``` + +## Running Queries + +1. Build the examples. +```bash +cd cpp/examples +./build.sh +``` +The TPC-H query binaries will be built in `examples/tpch/build`. + +2. Execute the queries. +```bash +./tpch/build/tpch_q1 +``` +A Parquet file named `q1.parquet` will be generated, holding the results of the query. diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp new file mode 100644 index 00000000000..1bdf039da4a --- /dev/null +++ b/cpp/examples/tpch/q1.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q1.cpp + * @brief Implement query 1 of the TPC-H benchmark. + * + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * + * select + * l_returnflag, + * l_linestatus, + * sum(l_quantity) as sum_qty, + * sum(l_extendedprice) as sum_base_price, + * sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + * sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + * avg(l_quantity) as avg_qty, + * avg(l_extendedprice) as avg_price, + * avg(l_discount) as avg_disc, + * count(*) as count_order + * from + * lineitem + * where + * l_shipdate <= date '1998-09-02' + * group by + * l_returnflag, + * l_linestatus + * order by + * l_returnflag, + * l_linestatus; + */ + +/** + * @brief Calculate the discount price column + * + * @param discount The discount column + * @param extendedprice The extended price column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +[[nodiscard]] std::unique_ptr calc_disc_price( + cudf::column_view const& discount, + cudf::column_view const& extendedprice, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const disc_price_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto disc_price = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + disc_price_type, + stream, + mr); + return disc_price; +} + +/** + * @brief Calculate the charge column + * + * @param tax The tax column + * @param disc_price The discount price column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_charge( + cudf::column_view const& tax, + cudf::column_view const& disc_price, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_plus_tax = + cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr); + auto const charge_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto charge = cudf::binary_operation( + disc_price, one_plus_tax->view(), cudf::binary_operator::MUL, charge_type, stream, mr); + return charge; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projections and filter predicate for `lineitem` table + std::vector const lineitem_cols = {"l_returnflag", + "l_linestatus", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_shipdate", + "l_orderkey", + "l_tax"}; + auto const shipdate_ref = cudf::ast::column_reference(std::distance( + lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate"))); + auto shipdate_upper = + cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); + auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); + auto lineitem_pred = std::make_unique( + cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); + + // Read out the `lineitem` table from parquet file + auto lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + + // Calculate the discount price and charge columns and append to lineitem table + auto disc_price = + calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); + auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view()); + (*lineitem).append(disc_price, "disc_price").append(charge, "charge"); + + // Perform the group by operation + auto const groupedby_table = apply_groupby( + lineitem, + groupby_context_t{ + {"l_returnflag", "l_linestatus"}, + { + {"l_extendedprice", + {{cudf::aggregation::Kind::SUM, "sum_base_price"}, + {cudf::aggregation::Kind::MEAN, "avg_price"}}}, + {"l_quantity", + {{cudf::aggregation::Kind::SUM, "sum_qty"}, {cudf::aggregation::Kind::MEAN, "avg_qty"}}}, + {"l_discount", + { + {cudf::aggregation::Kind::MEAN, "avg_disc"}, + }}, + {"disc_price", + { + {cudf::aggregation::Kind::SUM, "sum_disc_price"}, + }}, + {"charge", + {{cudf::aggregation::Kind::SUM, "sum_charge"}, + {cudf::aggregation::Kind::COUNT_ALL, "count_order"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = apply_orderby(groupedby_table, + {"l_returnflag", "l_linestatus"}, + {cudf::order::ASCENDING, cudf::order::ASCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q1.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp new file mode 100644 index 00000000000..e56850b94d6 --- /dev/null +++ b/cpp/examples/tpch/q5.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q5.cpp + * @brief Implement query 5 of the TPC-H benchmark. + * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view supplier as select * from '/tables/scale-1/supplier.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * create view region as select * from '/tables/scale-1/region.parquet'; + * + * select + * n_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue + * from + * customer, + * orders, + * lineitem, + * supplier, + * nation, + * region + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' + * group by + * n_name + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
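The o_orderdate window in main() below is expressed as whole days since the UNIX epoch via the days_since_epoch() helper from utils.hpp. As a sanity check of the two literals used there, here is a standalone sketch that mirrors that helper with <ctime> (the name days_since_epoch_ref is ad hoc, not part of the example code):

#include <cassert>
#include <ctime>

// Ad-hoc mirror of days_since_epoch() from utils.hpp: whole days from 1970-01-01.
int days_since_epoch_ref(int year, int month, int day)
{
  std::tm date{};
  date.tm_year = year - 1900;
  date.tm_mon  = month - 1;
  date.tm_mday = day;
  std::tm epoch{};
  epoch.tm_year = 70;
  epoch.tm_mon  = 0;
  epoch.tm_mday = 1;
  return static_cast<int>(std::difftime(std::mktime(&date), std::mktime(&epoch)) / 86400);
}

int main()
{
  // 1970..1993 contains 6 leap years (1972, 1976, ..., 1992): 24 * 365 + 6 = 8766
  assert(days_since_epoch_ref(1994, 1, 1) == 8766);
  // 1994 is not a leap year: 8766 + 365 = 9131
  assert(days_since_epoch_ref(1995, 1, 1) == 9131);
  return 0;
}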
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1995, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); + + // Define the column projection and filter predicate for the `region` table + std::vector const region_cols = {"r_regionkey", "r_name"}; + auto const r_name_ref = cudf::ast::column_reference(std::distance( + region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); + auto r_name_value = cudf::string_scalar("ASIA"); + auto const r_name_literal = cudf::ast::literal(r_name_value); + auto region_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = + read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); + auto const supplier = + read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + auto const nation = + read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"}); + auto const region = + read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred)); + + // Perform the joins + auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); + auto const join_b = apply_inner_join(join_a, customer, {"n_nationkey"}, 
{"c_nationkey"}); + auto const join_c = apply_inner_join(join_b, orders, {"c_custkey"}, {"o_custkey"}); + auto const join_d = apply_inner_join(join_c, lineitem, {"o_orderkey"}, {"l_orderkey"}); + auto joined_table = + apply_inner_join(supplier, join_d, {"s_suppkey", "s_nationkey"}, {"l_suppkey", "n_nationkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = + apply_groupby(joined_table, + groupby_context_t{{"n_name"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q5.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp new file mode 100644 index 00000000000..f11b3d6ab3b --- /dev/null +++ b/cpp/examples/tpch/q6.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q6.cpp + * @brief Implement query 6 of the TPC-H benchmark. + * + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * + * select + * sum(l_extendedprice * l_discount) as revenue + * from + * lineitem + * where + * l_shipdate >= date '1994-01-01' + * and l_shipdate < date '1995-01-01' + * and l_discount >= 0.05 + * and l_discount <= 0.07 + * and l_quantity < 24; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation( + extendedprice, discount, cudf::binary_operator::MUL, revenue_type, stream, mr); + return revenue; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Read out the `lineitem` table from parquet file + std::vector const lineitem_cols = { + "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"}; + auto const shipdate_ref = cudf::ast::column_reference(std::distance( + lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate"))); + auto shipdate_lower = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const shipdate_lower_literal = cudf::ast::literal(shipdate_lower); + auto shipdate_upper = + cudf::timestamp_scalar(days_since_epoch(1995, 1, 1), true); + auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); + auto const shipdate_pred_a = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal); + auto const shipdate_pred_b = + cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); + auto lineitem_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); + auto lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + + // Cast the discount and quantity columns to float32 and append to lineitem table + auto discout_float = + cudf::cast(lineitem->column("l_discount"), cudf::data_type{cudf::type_id::FLOAT32}); + auto quantity_float = + cudf::cast(lineitem->column("l_quantity"), cudf::data_type{cudf::type_id::FLOAT32}); + + (*lineitem).append(discout_float, "l_discount_float").append(quantity_float, "l_quantity_float"); + + // Apply the filters + auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float")); + auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float")); + + auto discount_lower = cudf::numeric_scalar(0.05); + auto const discount_lower_literal = cudf::ast::literal(discount_lower); + auto discount_upper = cudf::numeric_scalar(0.07); + auto const discount_upper_literal = cudf::ast::literal(discount_upper); + auto quantity_upper = cudf::numeric_scalar(24); + auto const quantity_upper_literal = cudf::ast::literal(quantity_upper); + + auto const discount_pred_a = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, discount_ref, discount_lower_literal); + + auto const discount_pred_b = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, discount_ref, discount_upper_literal); + auto const discount_pred = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred_a, discount_pred_b); + auto const quantity_pred = + cudf::ast::operation(cudf::ast::ast_operator::LESS, quantity_ref, quantity_upper_literal); + auto const discount_quantity_pred = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred, quantity_pred); + auto const 
filtered_table = apply_filter(lineitem, discount_quantity_pred); + + // Calculate the `revenue` column + auto revenue = + calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount")); + + // Sum the `revenue` column + auto const revenue_view = revenue->view(); + auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue"); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + result_table->to_parquet("q6.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q9.cpp b/cpp/examples/tpch/q9.cpp new file mode 100644 index 00000000000..d3c218253f9 --- /dev/null +++ b/cpp/examples/tpch/q9.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include +#include +#include + +/** + * @file q9.cpp + * @brief Implement query 9 of the TPC-H benchmark. + * + * create view part as select * from '/tables/scale-1/part.parquet'; + * create view supplier as select * from '/tables/scale-1/supplier.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view partsupp as select * from '/tables/scale-1/partsupp.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * nation, + * o_year, + * sum(amount) as sum_profit + * from + * ( + * select + * n_name as nation, + * extract(year from o_orderdate) as o_year, + * l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + * from + * part, + * supplier, + * lineitem, + * partsupp, + * orders, + * nation + * where + * s_suppkey = l_suppkey + * and ps_suppkey = l_suppkey + * and ps_partkey = l_partkey + * and p_partkey = l_partkey + * and o_orderkey = l_orderkey + * and s_nationkey = n_nationkey + * and p_name like '%green%' + * ) as profit + * group by + * nation, + * o_year + * order by + * nation, + * o_year desc; + */ + +/** + * @brief Calculate the amount column + * + * @param discount The discount column + * @param extendedprice The extended price column + * @param supplycost The supply cost column + * @param quantity The quantity column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
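calc_amount() below composes four cudf::binary_operation calls; the per-row arithmetic it implements is just the profit expression from the query text. A tiny host-side check (standard library only, illustrative values):

#include <cassert>
#include <cmath>

// amount = l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity
inline double q9_amount(double extendedprice, double discount, double supplycost, double quantity)
{
  return extendedprice * (1.0 - discount) - supplycost * quantity;
}

int main()
{
  // 1000 * (1 - 0.10) - 20 * 10 = 900 - 200 = 700
  assert(std::abs(q9_amount(1000.0, 0.10, 20.0, 10.0) - 700.0) < 1e-9);
  return 0;
}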
+ */ +[[nodiscard]] std::unique_ptr calc_amount( + cudf::column_view const& discount, + cudf::column_view const& extendedprice, + cudf::column_view const& supplycost, + cudf::column_view const& quantity, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type()); + auto const extendedprice_discounted_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto const extendedprice_discounted = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + extendedprice_discounted_type, + stream, + mr); + auto const supplycost_quantity_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto const supplycost_quantity = cudf::binary_operation( + supplycost, quantity, cudf::binary_operator::MUL, supplycost_quantity_type); + auto amount = cudf::binary_operation(extendedprice_discounted->view(), + supplycost_quantity->view(), + cudf::binary_operator::SUB, + extendedprice_discounted->type(), + stream, + mr); + return amount; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Read out the table from parquet files + auto const lineitem = read_parquet( + args.dataset_dir + "/lineitem.parquet", + {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); + auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet", + {"ps_suppkey", "ps_partkey", "ps_supplycost"}); + auto const supplier = + read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + + // Generating the `profit` table + // Filter the part table using `p_name like '%green%'` + auto const p_name = part->table().column(1); + auto const mask = + cudf::strings::like(cudf::strings_column_view(p_name), cudf::string_scalar("%green%")); + auto const part_filtered = apply_mask(part, mask); + + // Perform the joins + auto const join_a = apply_inner_join(supplier, nation, {"s_nationkey"}, {"n_nationkey"}); + auto const join_b = apply_inner_join(partsupp, join_a, {"ps_suppkey"}, {"s_suppkey"}); + auto const join_c = apply_inner_join(lineitem, part_filtered, {"l_partkey"}, {"p_partkey"}); + auto const join_d = apply_inner_join(orders, join_c, {"o_orderkey"}, {"l_orderkey"}); + auto const joined_table = + apply_inner_join(join_d, join_b, {"l_suppkey", "l_partkey"}, {"s_suppkey", "ps_partkey"}); + + // Calculate the `nation`, `o_year`, and `amount` columns + auto n_name = std::make_unique(joined_table->column("n_name")); + auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); + auto amount = calc_amount(joined_table->column("l_discount"), + joined_table->column("l_extendedprice"), + joined_table->column("ps_supplycost"), + joined_table->column("l_quantity")); + + // Put together the `profit` table + std::vector> profit_columns; + 
profit_columns.push_back(std::move(n_name)); + profit_columns.push_back(std::move(o_year)); + profit_columns.push_back(std::move(amount)); + + auto profit_table = std::make_unique(std::move(profit_columns)); + auto const profit = std::make_unique( + std::move(profit_table), std::vector{"nation", "o_year", "amount"}); + + // Perform the groupby operation + auto const groupedby_table = apply_groupby( + profit, + groupby_context_t{{"nation", "o_year"}, + {{"amount", {{cudf::groupby_aggregation::SUM, "sum_profit"}}}}}); + + // Perform the orderby operation + auto const orderedby_table = apply_orderby( + groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q9.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp new file mode 100644 index 00000000000..e586da2c802 --- /dev/null +++ b/cpp/examples/tpch/utils.hpp @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +// RMM memory resource creation utilities +inline auto make_cuda() { return std::make_shared(); } +inline auto make_pool() +{ + return rmm::mr::make_owning_wrapper( + make_cuda(), rmm::percent_of_free_device_memory(50)); +} +inline auto make_managed() { return std::make_shared(); } +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + make_managed(), rmm::percent_of_free_device_memory(50)); +} +inline std::shared_ptr create_memory_resource( + std::string const& mode) +{ + if (mode == "cuda") return make_cuda(); + if (mode == "pool") return make_pool(); + if (mode == "managed") return make_managed(); + if (mode == "managed_pool") return make_managed_pool(); + CUDF_FAIL("Unknown rmm_mode parameter: " + mode + + "\nExpecting: cuda, pool, managed, or managed_pool"); +} + +/** + * @brief A class to represent a table with column names attached + */ +class table_with_names { + public: + table_with_names(std::unique_ptr tbl, std::vector col_names) + : tbl(std::move(tbl)), col_names(col_names) + { + } + /** + * @brief Return the table view + */ + [[nodiscard]] cudf::table_view table() const { return tbl->view(); } + /** + * @brief Return the column view for a given column name + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::column_view column(std::string const& col_name) const + { + return tbl->view().column(col_id(col_name)); + } + /** + * @param Return the column names of the table + */ + [[nodiscard]] std::vector column_names() const { return col_names; } + /** + * @brief Translate a column name to a column index + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::size_type col_id(std::string const& col_name) 
const + { + CUDF_FUNC_RANGE(); + auto it = std::find(col_names.begin(), col_names.end(), col_name); + if (it == col_names.end()) { throw std::runtime_error("Column not found"); } + return std::distance(col_names.begin(), it); + } + /** + * @brief Append a column to the table + * + * @param col The column to append + * @param col_name The name of the appended column + */ + table_with_names& append(std::unique_ptr& col, std::string const& col_name) + { + CUDF_FUNC_RANGE(); + auto cols = tbl->release(); + cols.push_back(std::move(col)); + tbl = std::make_unique(std::move(cols)); + col_names.push_back(col_name); + return (*this); + } + /** + * @brief Select a subset of columns from the table + * + * @param col_names The names of the columns to select + */ + [[nodiscard]] cudf::table_view select(std::vector const& col_names) const + { + CUDF_FUNC_RANGE(); + std::vector col_indices; + for (auto const& col_name : col_names) { + col_indices.push_back(col_id(col_name)); + } + return tbl->select(col_indices); + } + /** + * @brief Write the table to a parquet file + * + * @param filepath The path to the parquet file + */ + void to_parquet(std::string const& filepath) const + { + CUDF_FUNC_RANGE(); + auto const sink_info = cudf::io::sink_info(filepath); + cudf::io::table_metadata metadata; + metadata.schema_info = + std::vector(col_names.begin(), col_names.end()); + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); + } + + private: + std::unique_ptr tbl; + std::vector col_names; +}; + +/** + * @brief Concatenate two vectors + * + * @param lhs The left vector + * @param rhs The right vector + */ +template +std::vector concat(std::vector const& lhs, std::vector const& rhs) +{ + std::vector result; + result.reserve(lhs.size() + rhs.size()); + std::copy(lhs.begin(), lhs.end(), std::back_inserter(result)); + std::copy(rhs.begin(), rhs.end(), std::back_inserter(result)); + return result; +} + +/** + * @brief Inner join two tables and gather the result + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr join_and_gather( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls) +{ + CUDF_FUNC_RANGE(); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = cudf::inner_join( + left_selected, right_selected, compare_nulls, rmm::mr::get_current_device_resource()); + + auto const left_indices_span = cudf::device_span{*left_join_indices}; + auto const right_indices_span = cudf::device_span{*right_join_indices}; + + auto const left_indices_col = cudf::column_view{left_indices_span}; + auto const right_indices_col = cudf::column_view{right_indices_span}; + + auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); + auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); + + auto joined_cols = 
left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique(std::move(joined_cols)); +} + +/** + * @brief Apply an inner join operation to two tables + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr apply_inner_join( + std::unique_ptr const& left_input, + std::unique_ptr const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + CUDF_FUNC_RANGE(); + std::vector left_on_indices; + std::vector right_on_indices; + std::transform( + left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) { + return left_input->col_id(col_name); + }); + std::transform(right_on.begin(), + right_on.end(), + std::back_inserter(right_on_indices), + [&](auto const& col_name) { return right_input->col_id(col_name); }); + auto table = join_and_gather( + left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls); + return std::make_unique( + std::move(table), concat(left_input->column_names(), right_input->column_names())); +} + +/** + * @brief Apply a filter predicated to a table + * + * @param table The input table + * @param predicate The filter predicate + */ +[[nodiscard]] std::unique_ptr apply_filter( + std::unique_ptr const& table, cudf::ast::operation const& predicate) +{ + CUDF_FUNC_RANGE(); + auto const boolean_mask = cudf::compute_column(table->table(), predicate); + auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); + return std::make_unique(std::move(result_table), table->column_names()); +} + +/** + * @brief Apply a boolean mask to a table + * + * @param table The input table + * @param mask The boolean mask + */ +[[nodiscard]] std::unique_ptr apply_mask( + std::unique_ptr const& table, std::unique_ptr const& mask) +{ + CUDF_FUNC_RANGE(); + auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); + return std::make_unique(std::move(result_table), table->column_names()); +} + +struct groupby_context_t { + std::vector keys; + std::unordered_map>> + values; +}; + +/** + * @brief Apply a groupby operation to a table + * + * @param table The input table + * @param ctx The groupby context + */ +[[nodiscard]] std::unique_ptr apply_groupby( + std::unique_ptr const& table, groupby_context_t const& ctx) +{ + CUDF_FUNC_RANGE(); + auto const keys = table->select(ctx.keys); + cudf::groupby::groupby groupby_obj(keys); + std::vector result_column_names; + result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end()); + std::vector requests; + for (auto& [value_col, aggregations] : ctx.values) { + requests.emplace_back(cudf::groupby::aggregation_request()); + for (auto& agg : aggregations) { + if (agg.first == cudf::aggregation::Kind::SUM) { + requests.back().aggregations.push_back( + cudf::make_sum_aggregation()); + } else if (agg.first == cudf::aggregation::Kind::MEAN) { + requests.back().aggregations.push_back( + cudf::make_mean_aggregation()); + } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) { + requests.back().aggregations.push_back( + 
cudf::make_count_aggregation()); + } else { + throw std::runtime_error("Unsupported aggregation"); + } + result_column_names.push_back(agg.second); + } + requests.back().values = table->column(value_col); + } + auto agg_results = groupby_obj.aggregate(requests); + std::vector> result_columns; + for (size_t i = 0; i < agg_results.first->num_columns(); i++) { + auto col = std::make_unique(agg_results.first->get_column(i)); + result_columns.push_back(std::move(col)); + } + for (size_t i = 0; i < agg_results.second.size(); i++) { + for (size_t j = 0; j < agg_results.second[i].results.size(); j++) { + result_columns.push_back(std::move(agg_results.second[i].results[j])); + } + } + auto result_table = std::make_unique(std::move(result_columns)); + return std::make_unique(std::move(result_table), result_column_names); +} + +/** + * @brief Apply an order by operation to a table + * + * @param table The input table + * @param sort_keys The sort keys + * @param sort_key_orders The sort key orders + */ +[[nodiscard]] std::unique_ptr apply_orderby( + std::unique_ptr const& table, + std::vector const& sort_keys, + std::vector const& sort_key_orders) +{ + CUDF_FUNC_RANGE(); + std::vector column_views; + for (auto& key : sort_keys) { + column_views.push_back(table->column(key)); + } + auto result_table = + cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders); + return std::make_unique(std::move(result_table), table->column_names()); +} + +/** + * @brief Apply a reduction operation to a column + * + * @param column The input column + * @param agg_kind The aggregation kind + * @param col_name The name of the output column + */ +[[nodiscard]] std::unique_ptr apply_reduction( + cudf::column_view const& column, + cudf::aggregation::Kind const& agg_kind, + std::string const& col_name) +{ + CUDF_FUNC_RANGE(); + auto const agg = cudf::make_sum_aggregation(); + auto const result = cudf::reduce(column, *agg, column.type()); + cudf::size_type const len = 1; + auto col = cudf::make_column_from_scalar(*result, len); + std::vector> columns; + columns.push_back(std::move(col)); + auto result_table = std::make_unique(std::move(columns)); + std::vector col_names = {col_name}; + return std::make_unique(std::move(result_table), col_names); +} + +/** + * @brief Read a parquet file into a table + * + * @param filename The path to the parquet file + * @param columns The columns to read + * @param predicate The filter predicate to pushdown + */ +[[nodiscard]] std::unique_ptr read_parquet( + std::string const& filename, + std::vector const& columns = {}, + std::unique_ptr const& predicate = nullptr) +{ + CUDF_FUNC_RANGE(); + auto const source = cudf::io::source_info(filename); + auto builder = cudf::io::parquet_reader_options_builder(source); + if (!columns.empty()) { builder.columns(columns); } + if (predicate) { builder.filter(*predicate); } + auto const options = builder.build(); + auto table_with_metadata = cudf::io::read_parquet(options); + std::vector column_names; + for (auto const& col_info : table_with_metadata.metadata.schema_info) { + column_names.push_back(col_info.name); + } + return std::make_unique(std::move(table_with_metadata.tbl), column_names); +} + +/** + * @brief Generate the `std::tm` structure from year, month, and day + * + * @param year The year + * @param month The month + * @param day The day + */ +std::tm make_tm(int year, int month, int day) +{ + std::tm tm{}; + tm.tm_year = year - 1900; + tm.tm_mon = month - 1; + tm.tm_mday = day; + return tm; +} + +/** + * @brief 
Calculate the number of days since the UNIX epoch + * + * @param year The year + * @param month The month + * @param day The day + */ +int32_t days_since_epoch(int year, int month, int day) +{ + std::tm tm = make_tm(year, month, day); + std::tm epoch = make_tm(1970, 1, 1); + std::time_t time = std::mktime(&tm); + std::time_t epoch_time = std::mktime(&epoch); + double diff = std::difftime(time, epoch_time) / (60 * 60 * 24); + return static_cast(diff); +} + +struct tpch_example_args { + std::string dataset_dir; + std::string memory_resource_type; +}; + +/** + * @brief Parse command line arguments into a struct + * + * @param argc The number of command line arguments + * @param argv The command line arguments + */ +tpch_example_args parse_args(int argc, char const** argv) +{ + if (argc < 3) { + std::string usage_message = "Usage: " + std::string(argv[0]) + + " \n The query result will be " + "saved to a parquet file named q{query_no}.parquet in the current " + "working directory "; + throw std::runtime_error(usage_message); + } + tpch_example_args args; + args.dataset_dir = argv[1]; + args.memory_resource_type = argv[2]; + return args; +} diff --git a/cpp/examples/utilities/timer.hpp b/cpp/examples/utilities/timer.hpp new file mode 100644 index 00000000000..65fa92e74cf --- /dev/null +++ b/cpp/examples/utilities/timer.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cudf { +namespace examples { +/** + * @brief Light-weight timer for measuring elapsed time. + * + * A timer object constructed from std::chrono, instrumenting at microseconds + * precision. Can display elapsed durations at milli and micro second + * scales. The timer starts at object construction. + */ +class timer { + public: + using micros = std::chrono::microseconds; + using millis = std::chrono::milliseconds; + + timer() { reset(); } + void reset() { start_time = std::chrono::high_resolution_clock::now(); } + auto elapsed() const { return (std::chrono::high_resolution_clock::now() - start_time); } + void print_elapsed_micros() const + { + std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() + << "us\n\n"; + } + void print_elapsed_millis() const + { + std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() + << "ms\n\n"; + } + + private: + using time_point_t = std::chrono::time_point; + time_point_t start_time; +}; + +} // namespace examples +}; // namespace cudf diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 22dad11e109..c74c91e39c2 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -290,6 +290,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, namespace binops { +/** + * @brief Returns true if the binary operator is supported for the given input types. 
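For example, a caller can use this predicate to guard a dispatch instead of relying on cudf::binary_operation to throw. A hedged usage sketch, assuming the declaration below is reachable as cudf::binops::is_supported_operation and using only types from the public cudf headers:

#include <cudf/binaryop.hpp>
#include <cudf/types.hpp>

// Returns true if lhs + rhs can be computed into a FLOAT64 column.
// Assumes the function declared below is exported as cudf::binops::is_supported_operation.
bool can_add_as_double(cudf::data_type lhs, cudf::data_type rhs)
{
  return cudf::binops::is_supported_operation(
    cudf::data_type{cudf::type_id::FLOAT64}, lhs, rhs, cudf::binary_operator::ADD);
}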
+ * + * @param out The output data type + * @param lhs The left-hand cudf::data_type + * @param rhs The right-hand cudf::data_type + * @param op The binary operator + * @return true if the binary operator is supported for the given input types + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + /** * @brief Computes output valid mask for op between a column and a scalar * diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 1ef8b3b120a..c3bc3ad89fa 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -42,17 +42,6 @@ template struct comparator_adapter { comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ constexpr auto operator()( - cuco::pair const&, - cuco::pair const&) const noexcept - { - // All build table keys are distinct thus `false` no matter what - return false; - } - __device__ constexpr auto operator()( cuco::pair const&, cuco::pair const&) const noexcept @@ -69,15 +58,6 @@ struct comparator_adapter { return _d_equal(lhs.second, rhs.second); } - __device__ constexpr auto operator()( - cuco::pair const& lhs, - cuco::pair const& rhs) const noexcept - { - if (lhs.first != rhs.first) { return false; } - return _d_equal(lhs.second, rhs.second); - } -#pragma nv_diagnostic pop - private: Equal _d_equal; }; diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index f1775c6d6d7..5007af7f9f1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -41,8 +41,8 @@ static constexpr size_type warp_size{32}; */ class grid_1d { public: - int const num_threads_per_block; - int const num_blocks; + thread_index_type const num_threads_per_block; + thread_index_type const num_blocks; /** * @param overall_num_elements The number of elements the kernel needs to * handle/process, in its main, one-dimensional/linear input (e.g. one or more @@ -55,9 +55,9 @@ class grid_1d { * than a single element; this affects the number of threads the grid must * contain */ - grid_1d(cudf::size_type overall_num_elements, - cudf::size_type num_threads_per_block, - cudf::size_type elements_per_thread = 1) + grid_1d(thread_index_type overall_num_elements, + thread_index_type num_threads_per_block, + thread_index_type elements_per_thread = 1) : num_threads_per_block(num_threads_per_block), num_blocks(util::div_rounding_up_safe(overall_num_elements, elements_per_thread * num_threads_per_block)) diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index e8486a80afc..9cdda773dbb 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -29,7 +29,7 @@ namespace dictionary { namespace detail { /** * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -40,7 +40,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -51,7 +51,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -61,7 +61,7 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -72,7 +72,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column /** * @copydoc - * cudf::dictionary::match_dictionaries(std::vector,mm::mr::device_memory_resource*) + * cudf::dictionary::match_dictionaries(std::vector,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6c3c3b4da07..c9cbc603226 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -84,8 +84,8 @@ template && - is_supported_representation_type())>* = nullptr> -CUDF_HOST_DEVICE inline Rep ipow(T exponent) + cuda::std::is_integral_v)>* = nullptr> +CUDF_HOST_DEVICE inline constexpr Rep ipow(T exponent) { cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp index 2c3a5c5629d..f12177c6a4b 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -18,6 +18,7 @@ #include +#include #include #include @@ -34,6 +35,49 @@ namespace numeric { namespace detail { +/** + * @brief Determine the number of significant bits in an integer + * + * @tparam T Type of input integer value. 
Must be either uint32_t, uint64_t, or __uint128_t + * @param value The integer whose bits are being counted + * @return The number of significant bits: the # of bits - # of leading zeroes + */ +template || std::is_same_v || + std::is_same_v)> +CUDF_HOST_DEVICE inline int count_significant_bits(T value) +{ +#ifdef __CUDA_ARCH__ + if constexpr (std::is_same_v) { + return 64 - __clzll(static_cast(value)); + } else if constexpr (std::is_same_v) { + return 32 - __clz(static_cast(value)); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + auto const low_bits = static_cast(value); + return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); + } +#else + // Undefined behavior to call __builtin_clzll() with zero in gcc and clang + if (value == 0) { return 0; } + + if constexpr (std::is_same_v) { + return 64 - __builtin_clzll(value); + } else if constexpr (std::is_same_v) { + return 32 - __builtin_clz(value); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + if (high_bits == 0) { + return 64 - __builtin_clzll(static_cast(value)); + } else { + return 128 - __builtin_clzll(high_bits); + } + } +#endif +} + /** * @brief Helper struct for getting and setting the components of a floating-point value * @@ -62,27 +106,28 @@ struct floating_converter { // The low 23 / 52 bits (for float / double) are the mantissa. // The mantissa is normalized. There is an understood 1 bit to the left of the binary point. // The value of the mantissa is in the range [1, 2). - /// # mantissa bits (-1 for understood bit) - static constexpr int num_mantissa_bits = cuda::std::numeric_limits::digits - 1; + /// # significand bits (includes understood bit) + static constexpr int num_significand_bits = cuda::std::numeric_limits::digits; + /// # stored mantissa bits (-1 for understood bit) + static constexpr int num_stored_mantissa_bits = num_significand_bits - 1; /// The mask for the understood bit - static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits); + static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_stored_mantissa_bits); /// The mask to select the mantissa static constexpr IntegralType mantissa_mask = understood_bit_mask - 1; // And in between are the bits used to store the biased power-of-2 exponent. /// # exponents bits (-1 for sign bit) - static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1; + static constexpr int num_exponent_bits = num_floating_bits - num_stored_mantissa_bits - 1; /// The mask for the exponents, unshifted static constexpr IntegralType unshifted_exponent_mask = (IntegralType(1) << num_exponent_bits) - 1; /// The mask to select the exponents - static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits; + static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_stored_mantissa_bits; // To store positive and negative exponents as unsigned values, the stored value for // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles. 
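Concretely, for IEEE-754 binary32 and binary64 these constants come out to 24/53 significand bits, 23/52 stored mantissa bits, 8/11 exponent bits, and biases of 127/1023. A host-only sketch that double-checks those values against <limits> (it mirrors the definitions here rather than using the converter):

#include <limits>

// binary32: 24-bit significand (23 stored), 8 exponent bits, bias 127
static_assert(std::numeric_limits<float>::digits == 24);
static_assert(std::numeric_limits<float>::max_exponent - 1 == 127);
static_assert(sizeof(float) * 8 - (std::numeric_limits<float>::digits - 1) - 1 == 8);
// binary64: 53-bit significand (52 stored), 11 exponent bits, bias 1023
static_assert(std::numeric_limits<double>::digits == 53);
static_assert(std::numeric_limits<double>::max_exponent - 1 == 1023);
static_assert(sizeof(double) * 8 - (std::numeric_limits<double>::digits - 1) - 1 == 11);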
/// 127 / 1023 for float / double - static constexpr IntegralType exponent_bias = - cuda::std::numeric_limits::max_exponent - 1; + static constexpr int exponent_bias = cuda::std::numeric_limits::max_exponent - 1; /** * @brief Reinterpret the bits of a floating-point value as an integer @@ -113,15 +158,15 @@ struct floating_converter { } /** - * @brief Extracts the integral significand of a bit-casted floating-point number + * @brief Checks whether the bit-casted floating-point value is +/-0 * - * @param integer_rep The bit-casted floating value to extract the exponent from - * @return The integral significand, bit-shifted to a (large) whole number + * @param integer_rep The bit-casted floating value to check if is +/-0 + * @return True if is a zero, else false */ - CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep) + CUDF_HOST_DEVICE inline static bool is_zero(IntegralType integer_rep) { - // Extract the significand, setting the high bit for the understood 1/2 - return (integer_rep & mantissa_mask) | understood_bit_mask; + // It's a zero if every non-sign bit is zero + return ((integer_rep & ~sign_mask) == 0); } /** @@ -137,40 +182,59 @@ struct floating_converter { } /** - * @brief Extracts the exponent of a bit-casted floating-point number + * @brief Extracts the significand and exponent of a bit-casted floating-point number, + * shifted for denormals. * - * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals - * For all of these cases, the decimal fixed_point number should be set to zero + * @note Zeros/inf/NaN not handled. * * @param integer_rep The bit-casted floating value to extract the exponent from - * @return The stored base-2 exponent, or INT_MIN for special values + * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep) + CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + IntegralType integer_rep) { - // First extract the exponent bits and handle its special values. - // To minimize branching, all of these special cases will return INT_MIN. - // For all of these cases, the decimal fixed_point number should be set to zero. + // Extract the significand + auto significand = (integer_rep & mantissa_mask); + + // Extract the exponent bits. auto const exponent_bits = integer_rep & exponent_mask; + + // Notes on special values of exponent_bits: + // bits = exponent_mask is +/-inf or NaN, but those are handled prior to input. + // bits = 0 is either a denormal (handled below) or a zero (handled earlier by caller). + int floating_pow2; if (exponent_bits == 0) { - // Because of the understood set-bit not stored in the mantissa, it is not possible - // to store the value zero directly. Instead both +/-0 and denormals are represented with - // the exponent bits set to zero. - // Thus it's fastest to just floor (generally unwanted) denormals to zero. - return INT_MIN; - } else if (exponent_bits == exponent_mask) { - //+/-inf and NaN values are stored with all of the exponent bits set. - // As none of these are representable by integers, we'll return the same value for all cases. - return INT_MIN; + // Denormal values are 2^(1 - exponent_bias) * Sum_i(B_i * 2^-i) + // Where i is the i-th mantissa bit (counting from the LEFT, starting at 1), + // and B_i is the value of that bit (0 or 1) + // So e.g. 
for the minimum denormal, only the lowest bit is set: + // FLT_TRUE_MIN = 2^(1 - 127) * 2^-23 = 2^-149 + // DBL_TRUE_MIN = 2^(1 - 1023) * 2^-52 = 2^-1074 + floating_pow2 = 1 - exponent_bias; + + // Line-up denormal to same (understood) bit as normal numbers + // This is so bit-shifting starts at the same bit index + auto const lineup_shift = num_significand_bits - count_significant_bits(significand); + significand <<= lineup_shift; + floating_pow2 -= lineup_shift; + } else { + // Extract the exponent value: shift the bits down and subtract the bias. + auto const shifted_exponent_bits = exponent_bits >> num_stored_mantissa_bits; + floating_pow2 = static_cast(shifted_exponent_bits) - exponent_bias; + + // Set the high bit for the understood 1/2 + significand |= understood_bit_mask; } - // Extract the exponent value: shift the bits down and subtract the bias. - using SignedIntegralType = cuda::std::make_signed_t; - SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits; - return shifted_exponent_bits - static_cast(exponent_bias); + // To convert the mantissa to an integer, we effectively applied #-mantissa-bits + // powers of 2 to convert the fractional value to an integer, so subtract them off here + int const pow2 = floating_pow2 - num_stored_mantissa_bits; + + return {significand, pow2}; } /** - * @brief Sets the sign bit of a positive floating-point number + * @brief Sets the sign bit of a floating-point number * * @param floating The floating-point value to set the sign of. Must be positive. * @param is_negative The sign bit to set for the floating-point number @@ -192,83 +256,60 @@ struct floating_converter { /** * @brief Adds to the base-2 exponent of a floating-point number * + * @note The caller must guarantee that the input is a positive (> 0) whole number. + * * @param floating The floating value to add to the exponent of. Must be positive. - * @param exp2 The power-of-2 to add to the floating-point number - * @return The input floating-point value * 2^exp2 + * @param pow2 The power-of-2 to add to the floating-point number + * @return The input floating-point value * 2^pow2 */ - CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2) + CUDF_HOST_DEVICE inline static FloatingType add_pow2(FloatingType floating, int pow2) { + // Note that the input floating-point number is positive (& whole), so we don't have to + // worry about the sign here; the sign will be set later in set_is_negative() + // Convert floating to integer auto integer_rep = bit_cast_to_integer(floating); // Extract the currently stored (biased) exponent + using SignedType = std::make_signed_t; auto exponent_bits = integer_rep & exponent_mask; - auto stored_exp2 = exponent_bits >> num_mantissa_bits; + auto stored_pow2 = static_cast(exponent_bits >> num_stored_mantissa_bits); // Add the additional power-of-2 - stored_exp2 += exp2; + stored_pow2 += pow2; // Check for exponent over/under-flow. - // Note that the input floating-point number is always positive, so we don't have to - // worry about the sign here; the sign will be set later in set_is_negative() - if (stored_exp2 <= 0) { - return 0.0; - } else if (stored_exp2 >= unshifted_exponent_mask) { + if (stored_pow2 <= 0) { + // Denormal (zero handled prior to input) + + // Early out if bit shift will zero it anyway. 
+ // Note: We must handle this explicitly, as too-large a bit-shift is UB + auto const bit_shift = -stored_pow2 + 1; //+1 due to understood bit set below + if (bit_shift > num_stored_mantissa_bits) { return 0.0; } + + // Clear the exponent bits (zero means 2^-126/2^-1022 w/ no understood bit) + integer_rep &= (~exponent_mask); + + // The input floating-point number has an "understood" bit that we need to set + // prior to bit-shifting. Set the understood bit. + integer_rep |= understood_bit_mask; + + // Convert to denormal: bit shift off the low bits + integer_rep >>= bit_shift; + } else if (stored_pow2 >= static_cast(unshifted_exponent_mask)) { + // Overflow: Set infinity return cuda::std::numeric_limits::infinity(); } else { - // Clear existing exponent bits and set new ones - exponent_bits = stored_exp2 << num_mantissa_bits; + // Normal number: Clear existing exponent bits and set new ones + exponent_bits = static_cast(stored_pow2) << num_stored_mantissa_bits; integer_rep &= (~exponent_mask); integer_rep |= exponent_bits; - - // Convert back to float - return bit_cast_to_floating(integer_rep); } - } -}; - -/** - * @brief Determine the number of significant bits in an integer - * - * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t - * @param value The integer whose bits are being counted - * @return The number of significant bits: the # of bits - # of leading zeroes - */ -template || std::is_same_v || - std::is_same_v)> -CUDF_HOST_DEVICE inline int count_significant_bits(T value) -{ -#ifdef __CUDA_ARCH__ - if constexpr (std::is_same_v) { - return 64 - __clzll(static_cast(value)); - } else if constexpr (std::is_same_v) { - return 32 - __clz(static_cast(value)); - } else if constexpr (std::is_same_v) { - // 128 bit type, must break up into high and low components - auto const high_bits = static_cast(value >> 64); - auto const low_bits = static_cast(value); - return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); - } -#else - // Undefined behavior to call __builtin_clzll() with zero in gcc and clang - if (value == 0) { return 0; } - if constexpr (std::is_same_v) { - return 64 - __builtin_clzll(value); - } else if constexpr (std::is_same_v) { - return 32 - __builtin_clz(value); - } else if constexpr (std::is_same_v) { - // 128 bit type, must break up into high and low components - auto const high_bits = static_cast(value >> 64); - if (high_bits == 0) { - return 64 - __builtin_clzll(static_cast(value)); - } else { - return 128 - __builtin_clzll(high_bits); - } + // Convert back to float + return bit_cast_to_floating(integer_rep); } -#endif -} +}; /** * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an @@ -276,18 +317,18 @@ CUDF_HOST_DEVICE inline int count_significant_bits(T value) * * @note Intended to be run at compile time. * - * @tparam Exp10 The power of 10 to calculate - * @return Returns 10^Exp10 + * @tparam Pow10 The power of 10 to calculate + * @return Returns 10^Pow10 */ -template +template constexpr __uint128_t large_power_of_10() { // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10. 
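10^20, for example, no longer fits in 64 bits, which is why the recursion bottoms out at a 10^19 literal. A hedged compile-time check, assuming the numeric::detail namespace and the cudf/fixed_point/floating_conversion.hpp header path shown in this diff:

#include <cudf/fixed_point/floating_conversion.hpp>

#include <cstdint>
#include <limits>

// 10^20 is one recursion step on the 10^19 literal and exceeds the uint64_t range.
static_assert(numeric::detail::large_power_of_10<20>() ==
              __uint128_t{10'000'000'000'000'000'000ULL} * 10U);
static_assert(numeric::detail::large_power_of_10<20>() >
              std::numeric_limits<std::uint64_t>::max());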
- static_assert(Exp10 >= 19); - if constexpr (Exp10 == 19) + static_assert(Pow10 >= 19); + if constexpr (Pow10 == 19) return __uint128_t(10000000000000000000ULL); else - return large_power_of_10() * __uint128_t(10); + return large_power_of_10() * __uint128_t(10); } /** @@ -295,11 +336,11 @@ constexpr __uint128_t large_power_of_10() * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator, from 0 to 9 inclusive. + * @return Returns value / 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10) { // Computing division this way is much faster than the alternatives. // Division is not implemented in GPU hardware, and the compiler will often implement it as a @@ -309,7 +350,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) // Instead, if the compiler can see exactly what number it is dividing by, it can // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc. - // For the compiler to see the value though, array lookup (with exp10 as the index) + // For the compiler to see the value though, array lookup (with pow10 as the index) // is not sufficient: We have to use a switch statement. Although this introduces a branch, // it is still much faster than doing the divide any other way. // Perhaps an array can be used in C++23 with the assume attribute? @@ -325,7 +366,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) // introduces too much pressure on the kernels that use this code, slowing down their benchmarks. // It also dramatically slows down the compile time. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value / 10U; case 2: return value / 100U; @@ -345,36 +386,13 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator, from 0 to 19 inclusive. + * @return Returns value / 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10) { - // See comments in divide_power10_32bit() for discussion. 
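The rewrite that follows does not change observable behavior: the helper still performs truncating integer division by 10^pow10, now with the divisor computed by ipow(). A hedged usage sketch, under the same namespace and header assumptions as above:

#include <cudf/fixed_point/floating_conversion.hpp>

#include <cassert>
#include <cstdint>

int main()
{
  using numeric::detail::divide_power10_64bit;
  std::uint64_t const value = 123'456'789ULL;
  assert(divide_power10_64bit(value, 0) == 123'456'789ULL);  // 10^0: unchanged
  assert(divide_power10_64bit(value, 3) == 123'456ULL);      // truncating divide by 1000
  assert(divide_power10_64bit(value, 9) == 0ULL);            // value < 10^9
  return 0;
}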
- switch (exp10) { - case 0: return value; - case 1: return value / 10U; - case 2: return value / 100U; - case 3: return value / 1000U; - case 4: return value / 10000U; - case 5: return value / 100000U; - case 6: return value / 1000000U; - case 7: return value / 10000000U; - case 8: return value / 100000000U; - case 9: return value / 1000000000U; - case 10: return value / 10000000000ULL; - case 11: return value / 100000000000ULL; - case 12: return value / 1000000000000ULL; - case 13: return value / 10000000000000ULL; - case 14: return value / 100000000000000ULL; - case 15: return value / 1000000000000000ULL; - case 16: return value / 10000000000000000ULL; - case 17: return value / 100000000000000000ULL; - case 18: return value / 1000000000000000000ULL; - case 19: return value / 10000000000000000000ULL; - default: return 0; - } + return value / ipow(pow10); } /** @@ -382,55 +400,13 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10) * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive. - * @return Returns value / 10^exp10. + * @param pow10 The power-of-10 of the denominator, from 0 to 38 inclusive. + * @return Returns value / 10^pow10. */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10) { - // See comments in divide_power10_32bit() for an introduction. - switch (exp10) { - case 0: return value; - case 1: return value / 10U; - case 2: return value / 100U; - case 3: return value / 1000U; - case 4: return value / 10000U; - case 5: return value / 100000U; - case 6: return value / 1000000U; - case 7: return value / 10000000U; - case 8: return value / 100000000U; - case 9: return value / 1000000000U; - case 10: return value / 10000000000ULL; - case 11: return value / 100000000000ULL; - case 12: return value / 1000000000000ULL; - case 13: return value / 10000000000000ULL; - case 14: return value / 100000000000000ULL; - case 15: return value / 1000000000000000ULL; - case 16: return value / 10000000000000000ULL; - case 17: return value / 100000000000000000ULL; - case 18: return value / 1000000000000000000ULL; - case 19: return value / 10000000000000000000ULL; - case 20: return value / large_power_of_10<20>(); - case 21: return value / large_power_of_10<21>(); - case 22: return value / large_power_of_10<22>(); - case 23: return value / large_power_of_10<23>(); - case 24: return value / large_power_of_10<24>(); - case 25: return value / large_power_of_10<25>(); - case 26: return value / large_power_of_10<26>(); - case 27: return value / large_power_of_10<27>(); - case 28: return value / large_power_of_10<28>(); - case 29: return value / large_power_of_10<29>(); - case 30: return value / large_power_of_10<30>(); - case 31: return value / large_power_of_10<31>(); - case 32: return value / large_power_of_10<32>(); - case 33: return value / large_power_of_10<33>(); - case 34: return value / large_power_of_10<34>(); - case 35: return value / large_power_of_10<35>(); - case 36: return value / large_power_of_10<36>(); - case 37: return value / large_power_of_10<37>(); - case 38: return value / large_power_of_10<38>(); - default: return 0; - } + return value / ipow<__uint128_t, Radix::BASE_10>(pow10); } /** @@ -438,14 +414,14 @@ CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10) * * @tparam T Type of value 
to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier, from 0 to 9 inclusive. + * @return Returns value * 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10) { // See comments in divide_power10_32bit() for discussion. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value * 10U; case 2: return value * 100U; @@ -465,36 +441,13 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier, from 0 to 19 inclusive. + * @return Returns value * 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10) { - // See comments in divide_power10_32bit() for discussion. - switch (exp10) { - case 0: return value; - case 1: return value * 10U; - case 2: return value * 100U; - case 3: return value * 1000U; - case 4: return value * 10000U; - case 5: return value * 100000U; - case 6: return value * 1000000U; - case 7: return value * 10000000U; - case 8: return value * 100000000U; - case 9: return value * 1000000000U; - case 10: return value * 10000000000ULL; - case 11: return value * 100000000000ULL; - case 12: return value * 1000000000000ULL; - case 13: return value * 10000000000000ULL; - case 14: return value * 100000000000000ULL; - case 15: return value * 1000000000000000ULL; - case 16: return value * 10000000000000000ULL; - case 17: return value * 100000000000000000ULL; - case 18: return value * 1000000000000000000ULL; - case 19: return value * 10000000000000000000ULL; - default: return 0; - } + return value * ipow(pow10); } /** @@ -502,113 +455,690 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive. - * @return Returns value * 10^exp10. + * @param pow10 The power-of-10 of the multiplier, from 0 to 38 inclusive. + * @return Returns value * 10^pow10. */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10) { - // See comments in divide_power10_128bit() for discussion. 
- switch (exp10) { - case 0: return value; - case 1: return value * 10U; - case 2: return value * 100U; - case 3: return value * 1000U; - case 4: return value * 10000U; - case 5: return value * 100000U; - case 6: return value * 1000000U; - case 7: return value * 10000000U; - case 8: return value * 100000000U; - case 9: return value * 1000000000U; - case 10: return value * 10000000000ULL; - case 11: return value * 100000000000ULL; - case 12: return value * 1000000000000ULL; - case 13: return value * 10000000000000ULL; - case 14: return value * 100000000000000ULL; - case 15: return value * 1000000000000000ULL; - case 16: return value * 10000000000000000ULL; - case 17: return value * 100000000000000000ULL; - case 18: return value * 1000000000000000000ULL; - case 19: return value * 10000000000000000000ULL; - case 20: return value * large_power_of_10<20>(); - case 21: return value * large_power_of_10<21>(); - case 22: return value * large_power_of_10<22>(); - case 23: return value * large_power_of_10<23>(); - case 24: return value * large_power_of_10<24>(); - case 25: return value * large_power_of_10<25>(); - case 26: return value * large_power_of_10<26>(); - case 27: return value * large_power_of_10<27>(); - case 28: return value * large_power_of_10<28>(); - case 29: return value * large_power_of_10<29>(); - case 30: return value * large_power_of_10<30>(); - case 31: return value * large_power_of_10<31>(); - case 32: return value * large_power_of_10<32>(); - case 33: return value * large_power_of_10<33>(); - case 34: return value * large_power_of_10<34>(); - case 35: return value * large_power_of_10<35>(); - case 36: return value * large_power_of_10<36>(); - case 37: return value * large_power_of_10<37>(); - case 38: return value * large_power_of_10<38>(); - default: return 0; - } + return value * ipow<__uint128_t, Radix::BASE_10>(pow10); } /** * @brief Multiply an integer by a power of 10. * - * @note Use this function if you have no a-priori knowledge of what exp10 might be. + * @note Use this function if you have no a-priori knowledge of what pow10 might be. * If you do, prefer calling the bit-size-specific versions * * @tparam Rep Representation type needed for integer exponentiation * @tparam T Integral type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier. + * @return Returns value * 10^pow10 */ -template )>* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int pow10) { - // Use this function if you have no knowledge of what exp10 might be + // Use this function if you have no knowledge of what pow10 might be // If you do, prefer calling the bit-size-specific versions if constexpr (sizeof(Rep) <= 4) { - return multiply_power10_32bit(value, exp10); + return multiply_power10_32bit(value, pow10); } else if constexpr (sizeof(Rep) <= 8) { - return multiply_power10_64bit(value, exp10); + return multiply_power10_64bit(value, pow10); } else { - return multiply_power10_128bit(value, exp10); + return multiply_power10_128bit(value, pow10); } } /** * @brief Divide an integer by a power of 10. * - * @note Use this function if you have no a-priori knowledge of what exp10 might be. + * @note Use this function if you have no a-priori knowledge of what pow10 might be. 
* If you do, prefer calling the bit-size-specific versions * * @tparam Rep Representation type needed for integer exponentiation * @tparam T Integral type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator. + * @return Returns value / 10^pow10 */ -template )>* = nullptr> -CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int pow10) { - // Use this function if you have no knowledge of what exp10 might be + // Use this function if you have no knowledge of what pow10 might be // If you do, prefer calling the bit-size-specific versions if constexpr (sizeof(Rep) <= 4) { - return divide_power10_32bit(value, exp10); + return divide_power10_32bit(value, pow10); } else if constexpr (sizeof(Rep) <= 8) { - return divide_power10_64bit(value, exp10); + return divide_power10_64bit(value, pow10); } else { - return divide_power10_128bit(value, exp10); + return divide_power10_128bit(value, pow10); } } +/** + * @brief Perform a bit-shift left, guarding against undefined behavior + * + * @tparam IntegerType Type of input unsigned integer value + * @param value The integer whose bits are being shifted + * @param bit_shift The number of bits to shift left + * @return The bit-shifted integer, except max value if UB would occur + */ +template )> +CUDF_HOST_DEVICE inline IntegerType guarded_left_shift(IntegerType value, int bit_shift) +{ + // Bit shifts larger than this are undefined behavior + constexpr int max_safe_bit_shift = cuda::std::numeric_limits::digits - 1; + return (bit_shift <= max_safe_bit_shift) ? value << bit_shift + : cuda::std::numeric_limits::max(); +} + +/** + * @brief Perform a bit-shift right, guarding against undefined behavior + * + * @tparam IntegerType Type of input unsigned integer value + * @param value The integer whose bits are being shifted + * @param bit_shift The number of bits to shift right + * @return The bit-shifted integer, which is zero on underflow + */ +template )> +CUDF_HOST_DEVICE inline IntegerType guarded_right_shift(IntegerType value, int bit_shift) +{ + // Bit shifts larger than this are undefined behavior + constexpr int max_safe_bit_shift = cuda::std::numeric_limits::digits - 1; + return (bit_shift <= max_safe_bit_shift) ? 
value >> bit_shift : 0; +} + +/** + * @brief Helper struct with common constants needed by the floating <--> decimal conversions + */ +template +struct shifting_constants { + /// Whether the type is double + static constexpr bool is_double = cuda::std::is_same_v; + + /// Integer type that can hold the value of the significand + using IntegerRep = std::conditional_t; + + /// Num bits needed to hold the significand + static constexpr auto num_significand_bits = cuda::std::numeric_limits::digits; + + /// Shift data back and forth in space of a type with 2x the starting bits, to give us enough room + using ShiftingRep = std::conditional_t; + + // The significand of a float / double is 24 / 53 bits + // However, to uniquely represent each double / float as different #'s in decimal + // you need 17 / 9 digits (from std::numeric_limits::max_digits10) + // To represent 10^17 / 10^9, you need 57 / 30 bits + // So we need to keep track of at least this # of bits during shifting to ensure no info is lost + + // We will be alternately shifting our data back and forth by powers of 2 and 10 to convert + // between floating and decimal (see shifting functions for details). + + // To iteratively shift back and forth, our 2's (bit-) and 10's (divide-/multiply-) shifts must + // be of nearly the same magnitude, or else we'll over-/under-flow our shifting integer + + // 2^10 is approximately 10^3, so the largest shifts will have a 10/3 ratio + // The difference between 2^10 and 10^3 is 1024/1000: 2.4% + // So every time we shift by 10 bits and 3 decimal places, the 2s shift is an extra 2.4% + + // This 2.4% error compounds each time we do an iteration. + // The min (normal) float is 2^-126. + // Min denormal: 2^-126 * 2^-23 (mantissa bits): 2^-149 = ~1.4E-45 + // With our 10/3 shifting ratio, 149 (bit-shifts) * (3 / 10) = 44.7 (10s-shifts) + // 10^(-44.7) = 2E-45, which is off by ~1.4x from 1.4E-45 + + // Similarly, the min (normal) double is 2^-1022. + // Min denormal: 2^-1022 * 2^-52 (mantissa bits): 2^-1074 = 4.94E-324 + // With our 10/3 shifting ratio, 1074 (bit-shifts) * (3 / 10) = 322.2 (10s-shifts) + // 10^(-322.2) = 6.4E-323, which is off by ~13.2x from 4.94E-324 + + // To account for this compounding error, we can either complicate our loop code (slow), + // or use extra bits (in the direction we're shifting the 2s!) to compensate: + // 4 extra bits for doubles (2^4 = 16 > 13.2x error), 1 extra for floats (2 > 1.4x error) + /// # buffer bits to account for shifting error + static constexpr int num_2s_shift_buffer_bits = is_double ? 4 : 1; + + // How much room do we have for shifting? + // Float: 64-bit ShiftingRep - 31 (rep + buffer) = 33 bits. 2^33 = 8.6E9 + // Double: 128-bit ShiftingRep - 61 (rep + buffer) = 67 bits. 2^67 = 1.5E20 + // Thus for double / float we can shift up to 20 / 9 decimal places at once + + // But, we need to stick to our 10-bits / 3-decimals shift ratio to not over/under-flow. + // To simplify our loop code, we'll keep to this ratio by instead shifting a max of + // 18 / 9 decimal places, for double / float (60 / 30 bits) + /// Max at-once decimal place shift + static constexpr int max_digits_shift = is_double ? 18 : 9; + /// Max at-once bit shift + static constexpr int max_bits_shift = max_digits_shift * 10 / 3; + + // Pre-calculate 10^max_digits_shift. 
Note that 10^18 / 10^9 fits within IntegerRep + /// 10^max_digits_shift + static constexpr auto max_digits_shift_pow = + multiply_power10(IntegerRep(1), max_digits_shift); +}; + +/** + * @brief Add half a bit to integer rep of floating point if conversion causes truncation + * + * @note This fixes problems like 1.2 (value = 1.1999...) at scale -1 -> 11 + * + * @tparam FloatingType Type of integer holding the floating-point significand + * @param floating The floating-point number to convert + * @param integer_rep The integer representation of the floating-point significand + * @param pow2 The power of 2 that needs to be applied to the significand + * @param pow10 The power of 10 that needs to be applied to the significand + * @return integer_rep, shifted 1 and ++'d if the conversion to decimal causes truncation + */ +template )> +CUDF_HOST_DEVICE cuda::std::pair::IntegralType, int> +add_half_if_truncates(FloatingType floating, + typename floating_converter::IntegralType integer_rep, + int pow2, + int pow10) +{ + // The user-supplied scale may truncate information, so we need to talk about rounding. + // We have chosen not to round, so we want 1.23456f with scale -4 to be decimal 12345 + + // But if we don't round at all, 1.2 (double) with scale -1 is 11 instead of 12! + // Why? Because 1.2 (double) is actually stored as 1.1999999... which we truncate to 1.1 + // While correct (given our choice to truncate), this is surprising and undesirable. + // This problem happens because 1.2 is not perfectly representable in floating point, + // and the value 1.199999... happened to be closer to 1.2 than the next value (1.2000...1...) + + // If the scale truncates information (we didn't choose to keep exactly 1.1999...), how + // do we make sure we store 1.2? We'll add half an ulp! (unit in the last place) + // Then 1.1999... becomes 1.2000...1... which truncates to 1.2. + // And if it had been 1.2000...1..., adding half an ulp still truncates to 1.2 + + // Why 1/2 an ulp? Because that's all that is needed. The reason we have this problem in the + // first place is because the compiler rounded (e.g.) 1.2 to the nearest floating point number. + // The distance of this rounding is at most 1/2 ulp, otherwise we'd have rounded the other way. + + // How do we add 1/2 an ulp? Just shift the bits left (updating pow2) and add 1. + // We'll always shift up so every input to the conversion algorithm is aligned the same way. + + // If we add a full ulp we run into issues where we add too much and get the wrong result. + // This is because (e.g.) 2^23 = 8.4E6 which is not quite 7 digits of precision. + // So if we want 7 digits, that may "barely" truncate information; adding a 1 ulp is overkill. + + // So when does the user-supplied scale truncate info? + // For powers > 0: When the 10s (scale) shift is larger than the corresponding bit-shift. + // For powers < 0: When the 10s shift is less than the corresponding bit-shift. 
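+  // (Worked example: double 1.2 at scale -1 arrives here with pow10 = -1 and pow2 = -52.
+  //  The one-decimal-place shift is only ~3.3 bits, far less than the 52-bit shift, so the
+  //  conversion truncates: 1.1999... * 10 = 11.999... -> 11 unless the half-ulp below is added.)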
+ + // Corresponding bit-shift: + // 2^10 is approximately 10^3, but this is off by 1.024% + // 1.024^30 is 2.03704, so this is high by one bit for every 30*3 = 90 powers of 10 + // So 10^N = 2^(10*N/3 - N/90) = 2^(299*N/90) + // Do comparison without dividing, which loses information: + // Note: if shift is "equal," still truncates if pow2 < 0 (shifting UP by 2s, 2^10 > 10^3) + int const pow2_term = 90 * pow2; + int const pow10_term = 299 * pow10; + bool const conversion_truncates = + (pow10_term > pow2_term) || ((pow2_term == pow10_term) && (pow2 < 0)); + + // However, don't add a half-bit if the input is a whole number! + // This is only for errors introduced by rounding decimal fractions! + bool const is_whole_number = (cuda::std::floor(floating) == floating); + bool const add_half_bit = conversion_truncates && !is_whole_number; + + // Add half a bit on truncation (shift to make room and update pow2) + integer_rep <<= 1; + --pow2; + integer_rep += static_cast(add_half_bit); + + return {integer_rep, pow2}; +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 > 0 + * + * @tparam Rep The type of the storage for the decimal value + * @tparam FloatingType The type of the original floating-point value we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Magnitude of the converted-to decimal integer + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t shift_to_decimal_pospow( + typename shifting_constants::IntegerRep const base2_value, int pow2, int pow10) +{ + // To convert to decimal, we need to apply the input powers of 2 and 10 + // The result will be (integer) base2_value * (2^pow2) / (10^pow10) + // Output type is ShiftingRep + + // Here pow10 > 0 and pow2 > 0, so we need to shift left by 2s and divide by 10s. + // We'll iterate back and forth between them, shifting up by 2s + // and down by 10s until all of the powers have been applied. + + // However the input base2_value type has virtually no spare room to shift our data + // without over- or under-flowing and losing precision. + // So we'll cast up to ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + auto shifting_rep = static_cast(base2_value); + + // We want to start with our significand bits at the top of the shifting range, + // so that we don't lose information we need on intermediary right-shifts. + // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side, + // For all numbers this bit shift is a fixed distance, due to the understood 2^0 bit. + // Note that shift_from is +1 due to shift in add_half_if_truncates() + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits; + static constexpr int shift_from = Constants::num_significand_bits + 1; + static constexpr int max_init_shift = shift_up_to - shift_from; + + // If our total bit shift is less than this, we don't need to iterate + using UnsignedRep = cuda::std::make_unsigned_t; + if (pow2 <= max_init_shift) { + // Shift bits left, divide by 10s to apply the scale factor, and we're done. + shifting_rep = divide_power10(shifting_rep << pow2, pow10); + // NOTE: Cast can overflow! 
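+    // (The cast below narrows ShiftingRep to the unsigned output Rep; both types are unsigned,
+    //  so an out-of-range result wraps modulo 2^N rather than being undefined behavior.)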
+ return static_cast(shifting_rep); + } + + // We need to iterate. Do the combined initial shift + shifting_rep <<= max_init_shift; + pow2 -= max_init_shift; + + // Iterate, dividing by 10s and shifting up by 2s until we're almost done + while (pow10 > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Divide the max number of 10s + shifting_rep /= Constants::max_digits_shift_pow; + pow10 -= Constants::max_digits_shift; + + // If our remaining bit shift is less than the max, we're finished iterating + if (pow2 <= Constants::max_bits_shift) { + // Shift bits left, divide by 10s to apply the scale factor, and we're done. + shifting_rep = divide_power10(shifting_rep << pow2, pow10); + + // NOTE: Cast can overflow! + return static_cast(shifting_rep); + } + + // Shift the max number of bits left again + shifting_rep <<= Constants::max_bits_shift; + pow2 -= Constants::max_bits_shift; + } + + // Last 10s-shift: Divide all remaining decimal places, shift all remaining bits, then bail + // Note: This divide result may not fit in the low half of the bit range + // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = divide_power10_64bit(shifting_rep, pow10); + } else { + shifting_rep = divide_power10_32bit(shifting_rep, pow10); + } + + // Final bit shift: Shift may be large, guard against UB + // NOTE: This can overflow (both cast and shift)! + return guarded_left_shift(static_cast(shifting_rep), pow2); +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 < 0 + * + * @tparam Rep The type of the storage for the decimal value + * @tparam FloatingType The type of the original floating-point value we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Magnitude of the converted-to decimal integer + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t shift_to_decimal_negpow( + typename shifting_constants::IntegerRep base2_value, int pow2, int pow10) +{ + // This is similar to shift_to_decimal_pospow(), except pow10 < 0 & pow2 < 0 + // See comments in that function for details. 
+  // Instead here we need to multiply by 10s and shift right by 2s
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants = shifting_constants;
+  using ShiftingRep = typename Constants::ShiftingRep;
+  auto shifting_rep = static_cast(base2_value);
+
+  // Convert to using positive values so we don't have to keep negating
+  int pow10_mag = -pow10;
+  int pow2_mag = -pow2;
+
+  // For performing final 10s-shift
+  using UnsignedRep = cuda::std::make_unsigned_t;
+  auto final_shifts_low10s = [&]() {
+    // Last 10s-shift: multiply all remaining decimal places, shift all remaining bits, then bail
+    // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
+    if constexpr (Constants::is_double) {
+      shifting_rep = multiply_power10_64bit(shifting_rep, pow10_mag);
+    } else {
+      shifting_rep = multiply_power10_32bit(shifting_rep, pow10_mag);
+    }
+
+    // Final bit shifting: Shift may be large, guard against UB
+    return static_cast(guarded_right_shift(shifting_rep, pow2_mag));
+  };
+
+  // If our total decimal shift is less than the max, we don't need to iterate
+  if (pow10_mag <= Constants::max_digits_shift) { return final_shifts_low10s(); }
+
+  // We want to start by lining up our bits to the top of the shifting range,
+  // except our first operation is a multiply, so not quite that far
+  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
+  // Note that shift_from is +1 due to shift in add_half_if_truncates()
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
+  static constexpr int shift_from = Constants::num_significand_bits + 1;
+  static constexpr int num_init_bit_shift = shift_up_to - shift_from;
+
+  // Perform initial shift
+  shifting_rep <<= num_init_bit_shift;
+  pow2_mag += num_init_bit_shift;
+
+  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
+  do {
+    // More decimal places to shift than we have room: Multiply the max number of 10s
+    shifting_rep *= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // If our remaining bit shift is less than the max, we're finished iterating
+    if (pow2_mag <= Constants::max_bits_shift) {
+      // Last bit-shift: Shift all remaining bits, apply the remaining scale, then bail
+      shifting_rep >>= pow2_mag;
+
+      // We need to convert to the output rep for the final scale-factor multiply, because if (e.g.)
+      // float -> dec128 and some large pow10_mag, it might overflow the 64bit shifting rep.
+      // It's not needed for pow10 > 0 because we're dividing by 10s there instead of multiplying.
+      // NOTE: This can overflow!
(Both multiply and cast) + return multiply_power10(static_cast(shifting_rep), pow10_mag); + } + + // More bits to shift than we have room: Shift the max number of 2s + shifting_rep >>= Constants::max_bits_shift; + pow2_mag -= Constants::max_bits_shift; + } while (pow10_mag > Constants::max_digits_shift); + + // Do our final shifts + return final_shifts_low10s(); +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion + * + * @tparam Rep The type of integer we are converting to, to store the decimal value + * @tparam FloatingType The type of floating-point object we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Integer representation of the floating-point value, given the desired scale + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t convert_floating_to_integral_shifting( + typename floating_converter::IntegralType base2_value, int pow10, int pow2) +{ + // Apply the powers of 2 and 10 to convert to decimal. + // The result will be base2_value * (2^pow2) / (10^pow10) + + // Note that while this code is branchy, the decimal scale factor is part of the + // column type itself, so every thread will take the same branches on pow10. + // Also data within a column tends to be similar, so they will often take the + // same branches on pow2 as well. + + // NOTE: some returns here can overflow (e.g. ShiftingRep -> UnsignedRep) + using UnsignedRep = cuda::std::make_unsigned_t; + if (pow10 == 0) { + // NOTE: Left Bit-shift can overflow! As can cast! (e.g. double -> decimal32) + // Bit shifts may be large, guard against UB + if (pow2 >= 0) { + return guarded_left_shift(static_cast(base2_value), pow2); + } else { + return static_cast(guarded_right_shift(base2_value, -pow2)); + } + } else if (pow10 > 0) { + if (pow2 <= 0) { + // Power-2/10 shifts both downward: order doesn't matter, apply and bail. + // Guard against shift being undefined behavior + auto const shifted = guarded_right_shift(base2_value, -pow2); + return static_cast(divide_power10(shifted, pow10)); + } + return shift_to_decimal_pospow(base2_value, pow2, pow10); + } else { // pow10 < 0 + if (pow2 >= 0) { + // Power-2/10 shifts both upward: order doesn't matter, apply and bail. + // NOTE: Either shift, multiply, or cast (e.g. double -> decimal32) can overflow! 
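+      // (guarded_left_shift() saturates to the type's maximum instead of invoking undefined
+      //  behavior when the shift exceeds the type width; the multiply_power10() that follows
+      //  operates on an unsigned type and simply wraps if the result does not fit.)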
+ auto const shifted = guarded_left_shift(static_cast(base2_value), pow2); + return multiply_power10(shifted, -pow10); + } + return shift_to_decimal_negpow(base2_value, pow2, pow10); + } +} + +/** + * @brief Perform floating-point -> integer decimal conversion + * + * @tparam Rep The type of integer we are converting to, to store the decimal value + * @tparam FloatingType The type of floating-point object we are converting from + * @param floating The floating point value to convert + * @param scale The desired base-10 scale factor: decimal value = returned value * 10^scale + * @return Integer representation of the floating-point value, given the desired scale + */ +template )> +CUDF_HOST_DEVICE inline Rep convert_floating_to_integral(FloatingType const& floating, + scale_type const& scale) +{ + // Extract components of the floating point number + using converter = floating_converter; + auto const integer_rep = converter::bit_cast_to_integer(floating); + if (converter::is_zero(integer_rep)) { return 0; } + + // Note that the significand here is an unsigned integer with sizeof(FloatingType) + auto const is_negative = converter::get_is_negative(integer_rep); + auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep); + + // Add half a bit if truncating to yield expected value, see function for discussion. + auto const pow10 = static_cast(scale); + auto const [base2_value, pow2] = + add_half_if_truncates(floating, significand, floating_pow2, pow10); + + // Apply the powers of 2 and 10 to convert to decimal. + auto const magnitude = + convert_floating_to_integral_shifting(base2_value, pow10, pow2); + + // Reapply the sign and return + // NOTE: Cast can overflow! + auto const signed_magnitude = static_cast(magnitude); + return is_negative ? -signed_magnitude : signed_magnitude; +} + +/** + * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 > 0 + * + * @tparam DecimalRep The decimal integer type we are converting from + * @tparam FloatingType The type of floating point object we are converting to + * @param decimal_rep The decimal integer to convert + * @param pow10 The number of powers of 10 to apply to undo the scale factor + * @return A pair of the base-2 value and the remaining powers of 2 to be applied + */ +template )> +CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10) +{ + // This is the reverse of shift_to_decimal_pospow(), see that for more details. + + // ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + + // We want to start by lining up our bits to the top of the shifting range, + // except our first operation is a multiply, so not quite that far + // We are bit-shifting down, so we need extra bits on the low-side, which this has. + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift; + int const shift_from = count_significant_bits(decimal_rep); + int const num_init_bit_shift = shift_up_to - shift_from; + int pow2 = -num_init_bit_shift; + + // Perform the initial bit shift + ShiftingRep shifting_rep; + if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) { + // Shift within DecimalRep before dropping to the smaller ShiftingRep + decimal_rep = (pow2 >= 0) ? 
(decimal_rep >> pow2) : (decimal_rep << -pow2); + shifting_rep = static_cast(decimal_rep); + } else { + // Scale up to ShiftingRep before shifting + shifting_rep = static_cast(decimal_rep); + shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2); + } + + // Iterate, multiplying by 10s and shifting down by 2s until we're almost done + while (pow10 > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Multiply the max number of 10s + shifting_rep *= Constants::max_digits_shift_pow; + pow10 -= Constants::max_digits_shift; + + // Then make more room by bit shifting down by the max # of 2s + shifting_rep >>= Constants::max_bits_shift; + pow2 += Constants::max_bits_shift; + } + + // Last 10s-shift: multiply all remaining decimal places + // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = multiply_power10_64bit(shifting_rep, pow10); + } else { + shifting_rep = multiply_power10_32bit(shifting_rep, pow10); + } + + // Our shifting_rep is now the integer mantissa, return it and the powers of 2 + return std::pair{shifting_rep, pow2}; +} + +/** + * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 < 0 + * + * @tparam DecimalRep The decimal integer type we are converting from + * @tparam FloatingType The type of floating point object we are converting to + * @param decimal_rep The decimal integer to convert + * @param pow10 The number of powers of 10 to apply to undo the scale factor + * @return A pair of the base-2 value and the remaining powers of 2 to be applied + */ +template )> +CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10) +{ + // This is the reverse of shift_to_decimal_negpow(), see that for more details. + + // ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + + // We want to start with our significand bits at the top of the shifting range, + // so that we lose minimal information we need on intermediary right-shifts. + // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits; + int const shift_from = count_significant_bits(decimal_rep); + int const num_init_bit_shift = shift_up_to - shift_from; + int pow2 = -num_init_bit_shift; + + // Perform the initial bit shift + ShiftingRep shifting_rep; + if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) { + // Shift within DecimalRep before dropping to the smaller ShiftingRep + decimal_rep = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2); + shifting_rep = static_cast(decimal_rep); + } else { + // Scale up to ShiftingRep before shifting + shifting_rep = static_cast(decimal_rep); + shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2); + } + + // Convert to using positive values upfront, simpler than doing later. 
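+  // (pow10 is negative on this path -- see the @brief above -- so pow10_mag is positive.)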
+  int pow10_mag = -pow10;
+
+  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
+  while (pow10_mag > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Divide the max number of 10s
+    shifting_rep /= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // Then make more room by bit shifting up by the max # of 2s
+    shifting_rep <<= Constants::max_bits_shift;
+    pow2 -= Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: Divide all remaining decimal places.
+  // This divide result may not fit in the low half of the bit range
+  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = divide_power10_64bit(shifting_rep, pow10_mag);
+  } else {
+    shifting_rep = divide_power10_32bit(shifting_rep, pow10_mag);
+  }
+
+  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
+  return std::pair{shifting_rep, pow2};
+}
+
+/**
+ * @brief Perform integer decimal -> floating-point conversion
+ *
+ * @tparam FloatingType The type of floating-point object we are converting to
+ * @tparam Rep The decimal integer type we are converting from
+ * @param value The decimal integer to convert
+ * @param scale The base-10 scale factor for the input integer
+ * @return Floating-point representation of the scaled integral value
+ */
+template )>
+CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& value,
+                                                                  scale_type const& scale)
+{
+  // Check the sign of the input
+  bool const is_negative = (value < 0);
+
+  // Convert to unsigned for bit counting/shifting
+  using UnsignedType = cuda::std::make_unsigned_t;
+  auto const unsigned_value = [&]() -> UnsignedType {
+    // Must guard against minimum value, as we can't just negate it: not representable.
+    if (value == cuda::std::numeric_limits::min()) { return static_cast(value); }
+
+    // No abs function for 128bit types, so have to do it manually.
+    if constexpr (cuda::std::is_same_v) {
+      return static_cast(is_negative ?
-value : value); + } else { + return cuda::std::abs(value); + } + }(); + + // Shift by powers of 2 and 10 to get our integer mantissa + auto const [mantissa, pow2] = [&]() { + auto const pow10 = static_cast(scale); + if (pow10 >= 0) { + return shift_to_binary_pospow(unsigned_value, pow10); + } else { // pow10 < 0 + return shift_to_binary_negpow(unsigned_value, pow10); + } + }(); + + // Zero has special exponent bits, just handle it here + if (mantissa == 0) { return FloatingType(0.0f); } + + // Cast our integer mantissa to floating point + auto const floating = static_cast(mantissa); // IEEE-754 rounds to even + + // Apply the sign and the remaining powers of 2 + using converter = floating_converter; + auto const magnitude = converter::add_pow2(floating, pow2); + return converter::set_is_negative(magnitude, is_negative); +} + } // namespace detail /** @} */ // end of group diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 502ffb9ba4f..11f6ce2bad7 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -39,6 +39,7 @@ #include #include +#include #include @@ -372,8 +373,8 @@ std::unique_ptr from_arrow( std::unique_ptr from_arrow( ArrowSchema const* schema, ArrowArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -391,8 +392,8 @@ std::unique_ptr from_arrow( std::unique_ptr from_arrow_column( ArrowSchema const* schema, ArrowArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowDeviceArray input @@ -415,8 +416,8 @@ std::unique_ptr from_arrow_column( std::unique_ptr from_arrow_host( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowArrayStream input @@ -433,8 +434,8 @@ std::unique_ptr
from_arrow_host( */ std::unique_ptr
from_arrow_stream( ArrowArrayStream* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from given ArrowDeviceArray input @@ -456,8 +457,8 @@ std::unique_ptr
from_arrow_stream( std::unique_ptr from_arrow_host_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray @@ -537,8 +538,8 @@ using unique_table_view_t = unique_table_view_t from_arrow_device( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter @@ -580,8 +581,8 @@ using unique_column_view_t = unique_column_view_t from_arrow_device_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp new file mode 100644 index 00000000000..1827ba0e3e6 --- /dev/null +++ b/cpp/include/cudf/io/config_utils.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace CUDF_EXPORT cudf { +namespace io::cufile_integration { + +/** + * @brief Returns true if cuFile and its compatibility mode are enabled. + */ +bool is_always_enabled(); + +/** + * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). + */ +bool is_gds_enabled(); + +/** + * @brief Returns true if KvikIO is enabled. + */ +bool is_kvikio_enabled(); + +} // namespace io::cufile_integration + +namespace io::nvcomp_integration { + +/** + * @brief Returns true if all nvCOMP uses are enabled. + */ +bool is_all_enabled(); + +/** + * @brief Returns true if stable nvCOMP use is enabled. 
+ */ +bool is_stable_enabled(); + +} // namespace io::nvcomp_integration +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 68bb7fba00e..cc361f0918e 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1756,11 +1756,9 @@ class csv_writer_options_builder { * * @param options Settings for controlling writing behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 50c1a7c163d..2a70fa888f4 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -49,14 +49,12 @@ table_with_metadata read_csv(std::unique_ptr&& source, * @param column_names Column names for the output CSV * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation */ void write_csv(data_sink* sink, table_view const& table, host_span column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 540a584908d..6ff1c12831b 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -46,13 +46,11 @@ table_with_metadata read_json(host_span> sources, * @param table The set of columns * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation */ void write_json(data_sink* sink, table_view const& table, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); /** * @brief Normalize single quotes to double quotes using FST diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8de690482f9..7af90766ad0 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1018,11 +1018,9 @@ class json_writer_options_builder { * * @param options Settings for controlling writing behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp new file mode 100644 index 00000000000..f3260d0cb53 --- /dev/null +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::nvcomp { + +enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; + +/** + * @brief Set of parameters that impact whether nvCOMP features are enabled. + * + */ +struct feature_status_parameters { + int lib_major_version; ///< major version + int lib_minor_version; ///< minor version + int lib_patch_version; ///< patch version + bool are_all_integrations_enabled; ///< all integrations + bool are_stable_integrations_enabled; ///< stable integrations + int compute_capability_major; ///< cuda compute major version + + /** + * @brief Default Constructor + */ + feature_status_parameters(); + + /** + * @brief feature_status_parameters Constructor + * + * @param major positive integer representing major value of nvcomp + * @param minor positive integer representing minor value of nvcomp + * @param patch positive integer representing patch value of nvcomp + * @param all_enabled if all integrations are enabled + * @param stable_enabled if stable integrations are enabled + * @param cc_major CUDA compute capability + */ + feature_status_parameters( + int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) + : lib_major_version{major}, + lib_minor_version{minor}, + lib_patch_version{patch}, + are_all_integrations_enabled{all_enabled}, + are_stable_integrations_enabled{stable_enabled}, + compute_capability_major{cc_major} + { + } +}; + +/** + * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key. + */ +inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs) +{ + return lhs.lib_major_version == rhs.lib_major_version and + lhs.lib_minor_version == rhs.lib_minor_version and + lhs.lib_patch_version == rhs.lib_patch_version and + lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and + lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and + lhs.compute_capability_major == rhs.compute_capability_major; +} + +/** + * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. + * + * Result depends on nvCOMP version and environment variables. + * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_compression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +/** + * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. + * + * Result depends on nvCOMP version and environment variables. 
+ * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_decompression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +} // namespace io::nvcomp +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 431f14af522..4d98cae73a7 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -597,6 +597,8 @@ class parquet_writer_options_base { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = false; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -689,6 +691,13 @@ class parquet_writer_options_base { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns maximum row group size, in bytes. * @@ -824,6 +833,13 @@ class parquet_writer_options_base { */ void enable_utc_timestamps(bool val); + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. + * + * @param val Boolean value to enable/disable writing of arrow schema. + */ + void enable_write_arrow_schema(bool val); + /** * @brief Sets the maximum row group size, in bytes. * @@ -1084,6 +1100,15 @@ class parquet_writer_options_builder_base { * @return this for chaining */ BuilderT& utc_timestamps(bool enabled); + + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + BuilderT& write_arrow_schema(bool enabled); + /** * @brief Set to true if V2 page headers are to be written. * diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a19aa9be0c0..a714f762a19 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -122,7 +122,7 @@ std::unique_ptr replace_slice( * If a target string is found, it is replaced by the corresponding entry in the repls column. * All occurrences found in each string are replaced. * - * This does not use regex to match targets in the string. + * This does not use regex to match targets in the string. Empty string targets are ignored. * * Null string entries will return null output string entries. 
* diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 74c8bc67d3a..1609c72f175 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -50,14 +51,19 @@ namespace cudf { */ template () && - cuda::std::is_floating_point_v>* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v&& is_fixed_point())> CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) { - using Rep = typename Fixed::rep; - auto const shifted = numeric::detail::shift(floating, scale); - numeric::scaled_integer scaled{static_cast(shifted), scale}; - return Fixed(scaled); + using Rep = typename Fixed::rep; + auto const value = [&]() { + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_floating_to_integral(floating, scale); + } else { + return static_cast(numeric::detail::shift(floating, scale)); + } + }(); + + return Fixed(numeric::scaled_integer{value, scale}); } /** @@ -75,14 +81,17 @@ CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::sca */ template && - is_fixed_point()>* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v&& is_fixed_point())> CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) { - using Rep = typename Fixed::rep; - auto const casted = static_cast(fixed.value()); - auto const scale = numeric::scale_type{-fixed.scale()}; - return numeric::detail::shift(casted, scale); + using Rep = typename Fixed::rep; + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_integral_to_floating(fixed.value(), fixed.scale()); + } else { + auto const casted = static_cast(fixed.value()); + auto const scale = numeric::scale_type{-fixed.scale()}; + return numeric::detail::shift(casted, scale); + } } /** @@ -95,7 +104,7 @@ CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) */ template >* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v)> CUDF_HOST_DEVICE Floating convert_to_floating(Input input) { if constexpr (is_fixed_point()) { @@ -202,6 +211,16 @@ std::unique_ptr cast( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Check if a cast between two datatypes is supported. + * + * @param from source type + * @param to target type + * + * @returns true if the cast is supported. + */ +bool is_supported_cast(data_type from, data_type to) noexcept; + /** * @brief Creates a column of `type_id::BOOL8` elements indicating the presence of `NaN` values * in a column of floating point values. diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index a39df064f44..45d5d1b12e1 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,11 @@ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Returns the global logger. 
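The reworked convert_floating_to_fixed / convert_fixed_to_floating helpers in cudf/unary.hpp above now route base-10 decimals through the new shifting-based conversion. A minimal usage sketch, assuming the template-parameter order implied by the signatures above and cudf's numeric::decimal64 fixed-point alias (illustrative only, not part of this patch):

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/unary.hpp>

void round_trip_example()
{
  // Store 1.25 as a base-10 decimal with two fractional digits (scale -2): integer rep 125.
  auto const fixed =
    cudf::convert_floating_to_fixed<numeric::decimal64>(1.25, numeric::scale_type{-2});

  // Convert back; 125 * 10^-2 is exactly representable, so this recovers 1.25.
  auto const back = cudf::convert_fixed_to_floating<double>(fixed);
}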
@@ -43,4 +45,4 @@ namespace cudf { */ spdlog::logger& logger(); -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp deleted file mode 100644 index c8c3eb097c4..00000000000 --- a/cpp/include/cudf/utilities/thread_pool.hpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/** - * Modified from https://github.com/bshoshany/thread-pool - * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license. - * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ - -#include // std::atomic -#include // std::chrono -#include // std::int_fast64_t, std::uint_fast32_t -#include // std::function -#include // std::future, std::promise -#include // std::shared_ptr, std::unique_ptr -#include // std::mutex, std::scoped_lock -#include // std::queue -#include // std::this_thread, std::thread -#include // std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t -#include // std::move, std::swap - -namespace cudf { -namespace detail { - -/** - * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a - * thread becomes available, it pops a task from the queue and executes it. Each task is - * automatically assigned a future, which can be used to wait for the task to finish executing - * and/or obtain its eventual return value. - */ -class thread_pool { - using ui32 = int; - - public: - /** - * @brief Construct a new thread pool. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - thread_pool(ui32 const& _thread_count = std::thread::hardware_concurrency()) - : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()), - threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()]) - { - create_threads(); - } - - /** - * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. - * Note that if the variable paused is set to true, then any tasks still in the queue will never - * be executed. - */ - ~thread_pool() - { - wait_for_tasks(); - running = false; - destroy_threads(); - } - - /** - * @brief Get the number of tasks currently waiting in the queue to be executed by the threads. - * - * @return The number of queued tasks. - */ - [[nodiscard]] size_t get_tasks_queued() const - { - std::scoped_lock const lock(queue_mutex); - return tasks.size(); - } - - /** - * @brief Get the number of tasks currently being executed by the threads. - * - * @return The number of running tasks. 
- */ - [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } - - /** - * @brief Get the total number of unfinished tasks - either still in the queue, or running in a - * thread. - * - * @return The total number of tasks. - */ - [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; } - - /** - * @brief Get the number of threads in the pool. - * - * @return The number of threads. - */ - [[nodiscard]] ui32 get_thread_count() const { return thread_count; } - - /** - * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the - * thread pool, and waiting for all blocks to finish executing. The loop will be equivalent to: - * for (T i = first_index; i <= last_index; i++) loop(i); - * - * @tparam T The type of the loop index. Should be a signed or unsigned integer. - * @tparam F The type of the function to loop through. - * @param first_index The first index in the loop (inclusive). - * @param last_index The last index in the loop (inclusive). - * @param loop The function to loop through. Should take exactly one argument, the loop index. - * @param num_tasks The maximum number of tasks to split the loop into. The default is to use the - * number of threads in the pool. - */ - template - void parallelize_loop(T first_index, T last_index, F const& loop, ui32 num_tasks = 0) - { - if (num_tasks == 0) num_tasks = thread_count; - if (last_index < first_index) std::swap(last_index, first_index); - size_t total_size = last_index - first_index + 1; - size_t block_size = total_size / num_tasks; - if (block_size == 0) { - block_size = 1; - num_tasks = (ui32)total_size > 1 ? (ui32)total_size : 1; - } - std::atomic blocks_running = 0; - for (ui32 t = 0; t < num_tasks; t++) { - T start = (T)(t * block_size + first_index); - T end = (t == num_tasks - 1) ? last_index : (T)((t + 1) * block_size + first_index - 1); - blocks_running++; - push_task([start, end, &loop, &blocks_running] { - for (T i = start; i <= end; i++) - loop(i); - blocks_running--; - }); - } - while (blocks_running != 0) { - sleep_or_yield(); - } - } - - /** - * @brief Push a function with no arguments or return value into the task queue. - * - * @tparam F The type of the function. - * @param task The function to push. - */ - template - void push_task(F const& task) - { - tasks_total++; - { - std::scoped_lock const lock(queue_mutex); - tasks.push(std::function(task)); - } - } - - /** - * @brief Push a function with arguments, but no return value, into the task queue. - * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks - * in the queue must be of type std::function, so they cannot have any arguments or return - * value. If no arguments are provided, the other overload will be used, in order to avoid the - * (slight) overhead of using a lambda. - * - * @tparam F The type of the function. - * @tparam A The types of the arguments. - * @param task The function to push. - * @param args The arguments to pass to the function. - */ - template - void push_task(F const& task, A const&... args) - { - push_task([task, args...] { task(args...); }); - } - - /** - * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be - * completed, then destroys all threads in the pool and creates a new thread pool with the new - * number of threads. Any tasks that were waiting in the queue before the pool was reset will then - * be executed by the new threads. 
If the pool was paused before resetting it, the new pool will - * be paused as well. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - void reset(ui32 const& _thread_count = std::thread::hardware_concurrency()) - { - bool was_paused = paused; - paused = true; - wait_for_tasks(); - running = false; - destroy_threads(); - thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency(); - threads = std::make_unique(thread_count); - paused = was_paused; - create_threads(); - running = true; - } - - /** - * @brief Submit a function with zero or more arguments and a return value into the task queue, - * and get a future for its eventual returned value. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @tparam R The return type of the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. - * @return A future to be used later to obtain the function's returned value, waiting for it to - * finish its execution if needed. - */ - template , std::decay_t...>> - std::future submit(F const& task, A const&... args) - { - std::shared_ptr> promise(new std::promise); - std::future future = promise->get_future(); - push_task([task, args..., promise] { - try { - if constexpr (std::is_void_v) { - task(args...); - promise->set_value(); - } else { - promise->set_value(task(args...)); - } - } catch (...) { - promise->set_exception(std::current_exception()); - }; - }); - return future; - } - - /** - * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those - * that are currently running in the threads and those that are still waiting in the queue. - * However, if the variable paused is set to true, this function only waits for the currently - * running tasks (otherwise it would wait forever). To wait for a specific task, use submit() - * instead, and call the wait() member function of the generated future. - */ - void wait_for_tasks() - { - while (true) { - if (!paused) { - if (tasks_total == 0) break; - } else { - if (get_tasks_running() == 0) break; - } - sleep_or_yield(); - } - } - - /** - * @brief An atomic variable indicating to the workers to pause. When set to true, the workers - * temporarily stop popping new tasks out of the queue, although any tasks already executed will - * keep running until they are done. Set to false again to resume popping tasks. - */ - std::atomic paused = false; - - /** - * @brief The duration, in microseconds, that the worker function should sleep for when it cannot - * find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will - * execute std::this_thread::yield() if there are no tasks in the queue. The default value is - * 1000. - */ - ui32 sleep_duration = 1000; - - private: - /** - * @brief Create the threads in the pool and assign a worker to each thread. - */ - void create_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i] = std::thread(&thread_pool::worker, this); - } - } - - /** - * @brief Destroy the threads in the pool by joining them. 
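For reference, a sketch of the submit/parallelize pattern supported by the `cudf::detail::thread_pool` being deleted above; it compiles only against the removed header, and the values are illustrative.

#include <cudf/utilities/thread_pool.hpp>  // removed by this change

#include <cassert>

void removed_pool_example()
{
  cudf::detail::thread_pool pool(4);  // four worker threads
  // submit() returns a std::future for the task's result
  auto fut = pool.submit([](int a, int b) { return a + b; }, 2, 3);
  assert(fut.get() == 5);
  // parallelize_loop() splits the inclusive range [0, 99] into blocks and waits for them
  pool.parallelize_loop(0, 99, [](int i) { /* per-index work */ (void)i; });
}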
- */ - void destroy_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i].join(); - } - } - - /** - * @brief Try to pop a new task out of the queue. - * - * @param task A reference to the task. Will be populated with a function if the queue is not - * empty. - * @return true if a task was found, false if the queue is empty. - */ - bool pop_task(std::function& task) - { - std::scoped_lock const lock(queue_mutex); - if (tasks.empty()) - return false; - else { - task = std::move(tasks.front()); - tasks.pop(); - return true; - } - } - - /** - * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead. - * - */ - void sleep_or_yield() - { - if (sleep_duration) - std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration)); - else - std::this_thread::yield(); - } - - /** - * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out - * of the queue and executes them, as long as the atomic variable running is set to true. - */ - void worker() - { - while (running) { - std::function task; - if (!paused && pop_task(task)) { - task(); - tasks_total--; - } else { - sleep_or_yield(); - } - } - } - - /** - * @brief A mutex to synchronize access to the task queue by different threads. - */ - mutable std::mutex queue_mutex; - - /** - * @brief An atomic variable indicating to the workers to keep running. When set to false, the - * workers permanently stop working. - */ - std::atomic running = true; - - /** - * @brief A queue of tasks to be executed by the threads. - */ - std::queue> tasks; - - /** - * @brief The number of threads in the pool. - */ - ui32 thread_count; - - /** - * @brief A smart pointer to manage the memory allocated for the threads. - */ - std::unique_ptr threads; - - /** - * @brief An atomic variable to keep track of the total number of unfinished tasks - either still - * in the queue, or running in a thread. - */ - std::atomic tasks_total = 0; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 7363f965af8..2abd6f0abac 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -226,6 +226,9 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) using namespace numeric; using RepType = typename ElementTo::rep; + CUDF_EXPECTS(std::all_of(begin, end, [](ElementFrom v) { return v.scale() == 0; }), + "Only zero-scale fixed-point values are supported"); + auto to_rep = [](ElementTo fp) { return fp.value(); }; auto transformer_begin = thrust::make_transform_iterator(begin, to_rep); auto const size = cudf::distance(begin, end); diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 3fe503f749e..42f84e4d0c7 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
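The column_wrapper.hpp hunk above now rejects fixed-point inputs whose scale is not zero; a small test-utility sketch that satisfies the new check (types and values are illustrative, using cudf's test wrappers).

#include <cudf_test/column_wrapper.hpp>

#include <cudf/fixed_point/fixed_point.hpp>

#include <vector>

void zero_scale_wrapper_example()
{
  using numeric::decimal32;
  using numeric::scale_type;
  // Every input value must carry scale 0 to pass the new CUDF_EXPECTS check.
  std::vector<decimal32> vals{decimal32{1, scale_type{0}}, decimal32{2, scale_type{0}}};
  cudf::test::fixed_width_column_wrapper<decimal32> col(vals.begin(), vals.end());
}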
# import argparse import os @@ -9,14 +9,12 @@ from xml.dom import minidom parser = argparse.ArgumentParser() -parser.add_argument( - "log_file", type=str, default=".ninja_log", help=".ninja_log file" -) +parser.add_argument("log_file", type=str, default=".ninja_log", help=".ninja_log file") parser.add_argument( "--fmt", type=str, default="csv", - choices=["csv", "xml", "html"], + choices=["csv", "html"], help="output format (to stdout)", ) parser.add_argument( @@ -37,6 +35,7 @@ output_fmt = args.fmt cmp_file = args.cmp_log + # build a map of the log entries def build_log_map(log_file): entries = {} @@ -68,37 +67,6 @@ def build_log_map(log_file): return entries -# output results in XML format -def output_xml(entries, sorted_list, args): - root = ET.Element("testsuites") - testsuite = ET.Element( - "testsuite", - attrib={ - "name": "build-time", - "tests": str(len(sorted_list)), - "failures": str(0), - "errors": str(0), - }, - ) - root.append(testsuite) - for name in sorted_list: - entry = entries[name] - build_time = float(entry[1] - entry[0]) / 1000 - item = ET.Element( - "testcase", - attrib={ - "classname": "BuildTime", - "name": name, - "time": str(build_time), - }, - ) - testsuite.append(item) - - tree = ET.ElementTree(root) - xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") - print(xmlstr) - - # utility converts a millisecond value to a column width in pixels def time_to_width(value, end): # map a value from (0,end) to (0,1000) @@ -282,9 +250,7 @@ def output_html(entries, sorted_list, cmp_entries, args): # output detail table in build-time descending order print("
") - print( - "", "", "", sep="" - ) + print("", "", "", sep="") if cmp_entries: print("", sep="") print("") @@ -303,9 +269,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", sep="", end="") print("", sep="", end="") # output diff column - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) diff_time_str = format_build_time(diff_time) @@ -353,7 +317,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print( "time change < 20%% or build time < 1 minute", + ">time change < 20% or build time < 1 minute", ) print("
FileCompile timeSize
FileCompile timeSizet-cmp
", build_time_str, "", file_size_str, "
") @@ -370,9 +334,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None print(build_time, file_size, name, sep=",", end="") if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) @@ -396,9 +358,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): # load the comparison build log if available cmp_entries = build_log_map(cmp_file) if cmp_file else None -if output_fmt == "xml": - output_xml(entries, sorted_list, args) -elif output_fmt == "html": +if output_fmt == "html": output_html(entries, sorted_list, cmp_entries, args) else: output_csv(entries, sorted_list, cmp_entries, args) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 8ac1491547d..3ac8547baad 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -50,6 +50,11 @@ namespace cudf { namespace binops { +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op); +} + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -194,7 +199,7 @@ std::unique_ptr binary_operation(LhsType const& lhs, rmm::device_async_resource_ref mr) { if constexpr (std::is_same_v and std::is_same_v) - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and output_type.id() == type_id::STRING and diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 73c1a474310..e1d289e67a3 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -56,7 +57,7 @@ struct dispatch_from_arrow_device { data_type, bool, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error); } @@ -68,7 +69,7 @@ struct dispatch_from_arrow_device { data_type type, bool skip_mask, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* schema, @@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* s data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input->length == 0) { return std::make_tuple( @@ -141,7 +142,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, "Large strings are not yet supported in from_arrow_device", @@ -182,7 +183,7 @@ 
dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView keys_schema_view; NANOARROW_THROW_NOT_OK( @@ -238,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector children; owned_columns_t out_owned_cols; @@ -283,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -324,7 +325,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -342,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -397,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -429,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -439,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index b7e07056686..b3087dedf98 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,7 @@ namespace { struct dispatch_copy_from_arrow_host { rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; std::unique_ptr get_mask_buffer(ArrowArray const* array) { @@ -131,7 +132,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, @@ -388,7 +389,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + 
rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -405,7 +406,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, std::unique_ptr from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -441,7 +442,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -462,7 +463,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -472,7 +473,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -482,7 +483,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -497,7 +498,7 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, std::unique_ptr from_arrow_column(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu index 0c85b561944..578105aa90a 100644 --- a/cpp/src/interop/from_arrow_stream.cu +++ b/cpp/src/interop/from_arrow_stream.cu @@ -41,7 +41,7 @@ namespace { std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView schema_view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); @@ -81,7 +81,7 @@ std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); @@ -135,7 +135,7 @@ std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::from_arrow_stream(input, stream, mr); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 2b3aa2f08f1..622a3aba4bb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -365,6 +365,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(metadata.children_meta.empty() || + metadata.children_meta.size() == static_cast(input.num_children()), + "Number of field names and number of children do not match\n"); std::unique_ptr tmp_column = nullptr; if ((input.offset() != 0) or ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) { @@ -375,8 +378,11 @@ std::shared_ptr dispatch_to_arrow::operator()( auto children_meta = metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty()) { - return std::make_shared(arrow::list(arrow::null()), 0, nullptr, nullptr); + if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { + auto element_type = child_arrays.empty() ? arrow::null() : child_arrays[1]->type(); + auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n"); + return result.ValueUnsafe(); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index ebfd6605977..b9d3a59e647 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -603,7 +603,7 @@ unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out, }); result->device_id = rmm::get_current_cuda_device().value(); result->device_type = ARROW_DEVICE_CUDA; - result->sync_event = private_data->sync_event; + result->sync_event = &private_data->sync_event; result->array = private_data->parent; // makes a shallow copy result->array.private_data = private_data.release(); result->array.release = &detail::ArrowDeviceArrayRelease; diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index f8920bf82c2..5d0c6a8c83b 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
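The interop hunks above change the memory-resource parameter from a raw `device_memory_resource*` to `rmm::device_async_resource_ref`; a minimal host-copy call sketch (the populated `schema` and `device_array` inputs are assumed to come from an Arrow producer).

#include <cudf/interop.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Copy an Arrow host array into a cudf table; both trailing arguments have defaults and are
// shown explicitly only to highlight the new resource-ref parameter.
std::unique_ptr<cudf::table> copy_from_arrow_host(ArrowSchema const* schema,
                                                  ArrowDeviceArray const* device_array)
{
  return cudf::from_arrow_host(
    schema, device_array, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
}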
*/ + #include "nvcomp_adapter.hpp" -#include "io/utilities/config_utils.hpp" #include "nvcomp_adapter.cuh" +#include +#include #include #include @@ -35,6 +37,13 @@ #include NVCOMP_ZSTD_HEADER #endif +// When building with nvcomp 4.0 or newer, map the new version macros to the old ones +#ifndef NVCOMP_MAJOR_VERSION +#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR +#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR +#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH +#endif + #define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) #define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) @@ -472,8 +481,8 @@ feature_status_parameters::feature_status_parameters() : lib_major_version{NVCOMP_MAJOR_VERSION}, lib_minor_version{NVCOMP_MINOR_VERSION}, lib_patch_version{NVCOMP_PATCH_VERSION}, - are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()}, - are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()} + are_all_integrations_enabled{nvcomp_integration::is_all_enabled()}, + are_stable_integrations_enabled{nvcomp_integration::is_stable_enabled()} { int device; CUDF_CUDA_TRY(cudaGetDevice(&device)); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 1a680a050fd..43c79e32375 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -17,8 +17,9 @@ #pragma once #include "gpuinflate.hpp" -#include "io/utilities/config_utils.hpp" +#include +#include #include #include @@ -27,70 +28,6 @@ #include namespace cudf::io::nvcomp { - -enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; - -/** - * @brief Set of parameters that impact whether the use nvCOMP features is enabled. - */ -struct feature_status_parameters { - int lib_major_version; - int lib_minor_version; - int lib_patch_version; - bool are_all_integrations_enabled; - bool are_stable_integrations_enabled; - int compute_capability_major; - - feature_status_parameters(); - feature_status_parameters( - int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) - : lib_major_version{major}, - lib_minor_version{minor}, - lib_patch_version{patch}, - are_all_integrations_enabled{all_enabled}, - are_stable_integrations_enabled{stable_enabled}, - compute_capability_major{cc_major} - { - } -}; - -/** - * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key. - */ -inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs) -{ - return lhs.lib_major_version == rhs.lib_major_version and - lhs.lib_minor_version == rhs.lib_minor_version and - lhs.lib_patch_version == rhs.lib_patch_version and - lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and - lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and - lhs.compute_capability_major == rhs.compute_capability_major; -} - -/** - * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. - * - * Result cab depend on nvCOMP version and environment variables. 
- * - * @param compression Compression type - * @param params Optional parameters to query status with different configurations - * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled - */ -[[nodiscard]] std::optional is_compression_disabled( - compression_type compression, feature_status_parameters params = feature_status_parameters()); - -/** - * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. - * - * Result can depend on nvCOMP version and environment variables. - * - * @param compression Compression type - * @param params Optional parameters to query status with different configurations - * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled - */ -[[nodiscard]] std::optional is_decompression_disabled( - compression_type compression, feature_status_parameters params = feature_status_parameters()); - /** * @brief Device batch decompression of given type. * diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7c4d5711281..00a6dcb2286 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator(), true, stream}; - auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, - newline, - string_scalar{"", false, stream}, - stream, - rmm::mr::get_current_device_resource()); - strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(stream); - char const* ptr_all_bytes = strings_column.chars_begin(stream); + // use strings concatenate to build the final CSV output in device memory + auto contents_w_nl = [&] { + auto const total_size = + str_column_view.chars_size(stream) + (newline.size() * str_column_view.size()); + auto const empty_str = string_scalar("", true, stream); + // use join_strings when the output will be less than 2GB + if (total_size < static_cast(std::numeric_limits::max())) { + return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr) + ->release(); + } + auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream); + // convert the last element into an empty string by resetting the last offset value + auto& offsets = nl_col->child(strings_column_view::offsets_column_index); + auto offsets_view = offsets.mutable_view(); + cudf::fill_in_place(offsets_view, + offsets.size() - 1, // set the last element with + offsets.size(), // the value from 2nd to last element + *cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr), + stream); + auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()}); + return cudf::strings::detail::concatenate( + nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr) + ->release(); + }(); + auto const total_num_bytes = contents_w_nl.data->size(); + auto const ptr_all_bytes = static_cast(contents_w_nl.data->data()); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory @@ -411,13 +430,13 @@ void write_csv(data_sink* out_sink, table_view const& table, host_span user_column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { 
// write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); + write_chunked_begin( + out_sink, table, user_column_names, options, stream, rmm::mr::get_current_device_resource()); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view @@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink, str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), options, stream, mr); + write_chunked( + out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource()); } } } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 5daa55d4552..6d2834206d4 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -215,9 +215,7 @@ table_with_metadata read_json(json_reader_options options, return json::detail::read_json(datasources, options, stream, mr); } -void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +void write_json(json_writer_options const& options, rmm::cuda_stream_view stream) { auto sinks = make_datasinks(options.get_sink()); CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); @@ -226,8 +224,7 @@ void write_json(json_writer_options const& options, sinks[0].get(), options.get_table(), options, - stream, - mr); + stream); } table_with_metadata read_csv(csv_reader_options options, @@ -252,9 +249,7 @@ table_with_metadata read_csv(csv_reader_options options, } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +void write_csv(csv_writer_options const& options, rmm::cuda_stream_view stream) { using namespace cudf::io::detail; @@ -266,8 +261,7 @@ void write_csv(csv_writer_options const& options, options.get_table(), options.get_names(), options, - stream, - mr); + stream); } raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, @@ -762,6 +756,9 @@ void parquet_writer_options_base::set_compression(compression_type compression) void parquet_writer_options_base::enable_int96_timestamps(bool req) { + CUDF_EXPECTS(not req or not is_enabled_write_arrow_schema(), + "INT96 timestamps and arrow schema cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); _write_timestamps_as_int96 = req; } @@ -770,6 +767,14 @@ void parquet_writer_options_base::enable_utc_timestamps(bool val) _write_timestamps_as_UTC = val; } +void parquet_writer_options_base::enable_write_arrow_schema(bool val) +{ + CUDF_EXPECTS(not val or not is_enabled_int96_timestamps(), + "arrow schema and INT96 timestamps cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); + _write_arrow_schema = val; +} + void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { CUDF_EXPECTS( @@ -974,6 +979,13 @@ BuilderT& parquet_writer_options_builder_base::utc_timestamp return static_cast(*this); } +template +BuilderT& parquet_writer_options_builder_base::write_arrow_schema(bool enabled) +{ + _options.enable_write_arrow_schema(enabled); + return static_cast(*this); +} + template BuilderT& parquet_writer_options_builder_base::write_v2_headers(bool enabled) { diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 
74001e5e01a..9cd39038348 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -193,7 +193,8 @@ datasource::owning_buffer> get_record_range_raw_input( size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, - "Invalid offsetting"); + "Invalid offsetting", + std::invalid_argument); auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 997d6fd99f8..c688c809e04 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -805,8 +805,7 @@ void write_chunked(data_sink* out_sink, strings_column_view const& str_column_view, int const skip_last_chars, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); @@ -829,8 +828,7 @@ void write_chunked(data_sink* out_sink, void write_json(data_sink* out_sink, table_view const& table, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); std::vector user_column_names = [&]() { @@ -912,7 +910,7 @@ void write_json(data_sink* out_sink, bool const include_line_terminator = (&sub_view != &vector_views.back()) or options.is_enabled_lines(); auto const skip_last_chars = (include_line_terminator ? 0 : line_terminator.size()); - write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream); } } else { if (options.is_enabled_lines()) { diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 72eb41b1360..8e20505d3ff 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -19,13 +19,13 @@ #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" -#include "io/utilities/config_utils.hpp" #include "io/utilities/hostdevice_span.hpp" #include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b6fc4e3510f..805959327ac 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -16,12 +16,12 @@ #include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" -#include "io/utilities/config_utils.hpp" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e9e031a407a..4cb20bb7518 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp new file mode 100644 index 00000000000..ddf65e9020f --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file arrow_schema_writer.cpp + * @brief Arrow IPC schema writer implementation + */ + +#include "arrow_schema_writer.hpp" + +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" +#include "writer_impl_helpers.hpp" + +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +namespace { + +// Copied over from arrow source for better code readability +namespace flatbuf = cudf::io::parquet::flatbuf; +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + +/** + * @brief Recursively construct the arrow schema (fields) tree + * + * @param fbb The root flatbuffer builder object instance + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return Flatbuffer offset to the constructed field + */ +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode const write_mode, + bool const utc_timestamps); + +/** + * @brief Functor to convert cudf column metadata to arrow schema field metadata + */ +struct dispatch_to_flatbuf { + FlatBufferBuilder& fbb; + cudf::detail::LinkedColPtr const& col; + column_in_metadata const& col_meta; + single_write_mode const write_mode; + bool const utc_timestamps; + Offset& field_offset; + flatbuf::Type& field_type_id; + std::vector& children; + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + 
{ + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Date; + // Date type (Set unit type to DAY for arrows's Date32) + field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. + // This also allows for easy and faithful roundtripping with cudf. 
+ field_type_id = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + } + + template + std::enable_if_t(), void> operator()() + { + field_type_id = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, + (col_meta.is_decimal_precision_set()) + ? col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, + col->type().scale(), + 128) + .Union(); + } + + template + std::enable_if_t(), void> operator()() + { + // Lists are represented differently in arrow and cuDF. + // cuDF representation: List: "col_name" : { "list", "element:int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + // Hence, we only need to process the second child of the list. + if constexpr (std::is_same_v) { + children.emplace_back(make_arrow_schema_fields( + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + field_type_id = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); + } + + // Traverse the struct in DFS manner and process children fields. + else if constexpr (std::is_same_v) { + std::transform(thrust::make_counting_iterator(0UL), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + }); + field_type_id = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); + } + } + + template + std::enable_if_t(), void> operator()() + { + // `dictionary32` columns are not written to parquet by cudf. 
+ CUDF_FAIL("Dictionary columns are not supported for writing"); + } +}; + +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + // Variables to be set by the dispatch_to_flatbuf functor + Offset field_offset = 0; + flatbuf::Type field_type_id = flatbuf::Type_NONE; + std::vector children; + + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + field_offset, + field_type_id, + children}); + + // push to field offsets vector + return flatbuf::CreateField( + fbb, + fbb.CreateString(column_metadata.get_name()), // name + is_output_column_nullable(column, column_metadata, write_mode), // nullable + field_type_id, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector +} + +} // namespace + +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + // Instantiate a flatbuffer builder + FlatBufferBuilder fbb; + + // Create an empty field offset vector and reserve space for linked columns + std::vector field_offsets; + field_offsets.reserve(linked_columns.size()); + + // populate field offsets (aka schema fields) + std::transform(thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.begin(), metadata.column_metadata.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), + std::back_inserter(field_offsets), + [&](auto const& elem) { + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + }); + + // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to + // create an ipc message flatbuffer + fbb.Finish(flatbuf::CreateMessage( + fbb, + flatbuf::MetadataVersion_V5, // Metadata version V5 (latest) + flatbuf::MessageHeader_Schema, // Schema type message header + flatbuf::CreateSchema(fbb, + flatbuf::Endianness::Endianness_Little, + fbb.CreateVector(field_offsets)) + .Union(), // arrow:schema built from the field vector + SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH // Body length is zero for schema type ipc message + )); + + // Construct the final string and store it here to use its view in base64_encode + std::string const ipc_message = + convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + + // Since the schema type ipc message doesn't have a body, the flatbuffer size is equal to the + // ipc message's metadata length + convert_int32_to_byte_string(fbb.GetSize()) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), fbb.GetSize()); + + // Encode the final ipc message string to base64 and return + return cudf::io::detail::base64_encode(ipc_message); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp new file mode 100644 index 00000000000..9bc435bf6c8 --- /dev/null +++ 
b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file arrow_schema_writer.hpp + * @brief Arrow IPC schema writer implementation + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Construct and return arrow schema from input parquet schema + * + * Recursively traverses through parquet schema to construct the arrow schema tree. + * Serializes the arrow schema tree and stores it as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header size (padded for 16 byte alignment) and a continuation string. The final + * string is base64 encoded and returned. + * + * @param linked_columns Vector of table column views + * @param metadata Metadata of the columns of the table + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return The constructed arrow ipc message string + */ +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ea80ae73c2f..8a866141c4b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -792,7 +792,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -801,7 +801,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -812,7 +812,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -821,7 +821,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index e49801e6172..62f1ee88036 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -26,6 +26,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { // # of threads we're decoding with @@ -163,7 +165,8 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) // For V1, the choice is an overestimate (s->dict_size), or an exact number that's // expensive to compute. For now we're going with the latter. 
else { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + str_len = gpuInitStringDescriptors( + s, nullptr, target_pos, cg::this_thread_block()); } break; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 7207173b82f..e0d50d7ccf9 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,6 +23,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int decode_block_size = 128; @@ -277,6 +279,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } // this needs to be here to prevent warp 3 modifying src_pos before all threads have read it __syncthreads(); + auto const tile_warp = cg::tiled_partition(cg::this_thread_block()); if (t < 32) { // decode repetition and definition levels. // - update validity vectors @@ -298,9 +301,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); } else if (s->col.physical_type == BYTE_ARRAY or s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, tile_warp); } - if (t == 32) { s->dict_pos = src_target_pos; } + if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int const dtype = s->col.physical_type; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b1f8e6dd5fe..a3f91f6859b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -21,6 +21,7 @@ #include "parquet_gpu.hpp" #include "rle_stream.cuh" +#include #include #include @@ -420,46 +421,62 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int t * @param[in,out] s Page state input/output * @param[out] sb Page state buffer output * @param[in] target_pos Target output position - * @param[in] t Thread ID + * @param[in] g Cooperative group (thread block or tile) * @tparam sizes_only True if only sizes are to be calculated * @tparam state_buf Typename of the `state_buf` (usually inferred) + * @tparam thread_group Typename of the cooperative group (inferred) * * @return Total length of strings processed */ -template -__device__ size_type -gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t) +template +__device__ size_type gpuInitStringDescriptors(page_state_s* s, + [[maybe_unused]] state_buf* sb, + int target_pos, + thread_group const& g) { - int pos = s->dict_pos; - int total_len = 0; + int const t = g.thread_rank(); + int const dict_size = s->dict_size; + int k = s->dict_val; + int pos = s->dict_pos; + int total_len = 0; + + // All group threads can participate for fixed len byte arrays. + if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { + int const dtype_len_in = s->dtype_len_in; + total_len = min((target_pos - pos) * dtype_len_in, dict_size - s->dict_val); + if constexpr (!sizes_only) { + for (pos += t, k += t * dtype_len_in; pos < target_pos; pos += g.size()) { + sb->str_len[rolling_index(pos)] = + (k < dict_size) ? dtype_len_in : 0; + // dict_idx is upperbounded by dict_size. + sb->dict_idx[rolling_index(pos)] = k; + // Increment k if needed. 
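// A reduced sketch of the cooperative-groups generalization applied to
// gpuInitStringDescriptors above: the same device helper can be handed either
// a whole thread block or a warp-sized tile, striding by g.size() and using
// g.thread_rank() instead of hand-rolled lane arithmetic. fill_fixed_len and
// example_kernel are made-up names for illustration only.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

template <typename ThreadGroup>
__device__ void fill_fixed_len(int* lengths, int count, int fixed_len, ThreadGroup const& g)
{
  // Each participating thread handles indices rank, rank + g.size(), rank + 2*g.size(), ...
  for (int i = g.thread_rank(); i < count; i += g.size()) {
    lengths[i] = fixed_len;
  }
  // Single-writer updates (like s->dict_val in the real code) would be guarded
  // with: if (g.thread_rank() == 0) { ... }
}

__global__ void example_kernel(int* lengths, int count)
{
  // A warp-sized tile here; passing cg::this_thread_block() works just as well.
  auto const tile = cg::tiled_partition<32>(cg::this_thread_block());
  fill_fixed_len(lengths, count, 4, tile);
}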
+ if (k < dict_size) { k = min(k + (g.size() * dtype_len_in), dict_size); } + } + } + // Only thread_rank = 0 updates the s->dict_val + if (!t) { s->dict_val += total_len; } + } + // This step is purely serial for byte arrays + else { + if (!t) { + uint8_t const* cur = s->data_start; - // This step is purely serial - if (!t) { - uint8_t const* cur = s->data_start; - int dict_size = s->dict_size; - int k = s->dict_val; - - while (pos < target_pos) { - int len = 0; - if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { - if (k < dict_size) { len = s->dtype_len_in; } - } else { + for (int len = 0; pos < target_pos; pos++, len = 0) { if (k + 4 <= dict_size) { len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); k += 4; if (k + len > dict_size) { len = 0; } } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; + } + k += len; + total_len += len; } - if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; - } - k += len; - total_len += len; - pos++; + s->dict_val = k; } - s->dict_val = k; - __threadfence_block(); } return total_len; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 58e8a09d5b6..ca74a1c2ba0 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -31,6 +31,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int preprocess_block_size = 512; @@ -1006,6 +1008,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } // this needs to be here to prevent warp 1/2 modifying src_pos before all threads have read it __syncthreads(); + + // Create a warp sized thread block tile + auto const tile_warp = cg::tiled_partition(cg::this_thread_block()); + if (t < 32) { // decode repetition and definition levels. 
// - update validity vectors @@ -1020,9 +1026,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) if (s->dict_base) { src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first; } else { - gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); + gpuInitStringDescriptors(s, sb, src_target_pos, tile_warp); } - if (t == 32) { s->dict_pos = src_target_pos; } + if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; } } else { int const me = t - out_thread0; diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 8507eca047e..e42c259b1bf 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include namespace cudf::io::parquet::detail { @@ -26,6 +27,15 @@ auto constexpr MAX_DECIMAL32_PRECISION = 9; auto constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) +// Constants copied from arrow source and renamed to match the case +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +int32_t constexpr IPC_CONTINUATION_TOKEN = -1; +std::string const ARROW_SCHEMA_KEY = "ARROW:schema"; + +// Schema type ipc message has zero length body +int64_t constexpr SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index d371ef5de93..3da303e6928 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -16,7 +16,6 @@ #include "compact_protocol_reader.hpp" #include "io/comp/nvcomp_adapter.hpp" -#include "io/utilities/config_utils.hpp" #include "io/utilities/time_utils.cuh" #include "reader_impl.hpp" #include "reader_impl_chunking.hpp" @@ -25,6 +24,7 @@ #include #include #include +#include #include @@ -862,7 +862,7 @@ std::vector compute_page_splits_by_row(device_span aggregate_reader_metadata::decode_ipc_message( std::string_view const serialized_message) const { - // Constants copied from arrow source and renamed to match the case - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); - constexpr int32_t IPC_CONTINUATION_TOKEN = -1; - // message buffer auto message_buf = serialized_message.data(); // current message (buffer) size diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9aeb19a7723..6bfa8519c76 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -117,6 +117,9 @@ struct metadata : public FileMetaData { void sanitize_schema(); }; +/** + * @brief Class to extract data types from arrow schema tree + */ struct arrow_schema_data_types { std::vector children; data_type type{type_id::EMPTY}; @@ -142,7 +145,7 @@ class aggregate_reader_metadata { const; /** - * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message + * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message * in key value metadata section of Parquet file footer */ [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const; diff --git a/cpp/src/io/parquet/writer_impl.cu 
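// A host-side sketch of how the reader-side decode_ipc_message (referenced in
// the reader changes above) consumes the prefix the writer emits: a 4-byte
// continuation token of -1 followed by a 4-byte metadata length, after which
// the flatbuffer Schema header begins. parse_ipc_prefix is an illustrative
// name; base64 decoding and flatbuffer verification are omitted.
#include <cstdint>
#include <cstring>
#include <optional>
#include <string_view>

inline std::optional<int32_t> parse_ipc_prefix(std::string_view message)
{
  if (message.size() < 2 * sizeof(int32_t)) { return std::nullopt; }
  int32_t token        = 0;
  int32_t metadata_len = 0;
  std::memcpy(&token, message.data(), sizeof(int32_t));
  std::memcpy(&metadata_len, message.data() + sizeof(int32_t), sizeof(int32_t));
  if (token != -1) { return std::nullopt; }  // expect the IPC continuation token
  return metadata_len;  // number of flatbuffer header bytes that follow
}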
b/cpp/src/io/parquet/writer_impl.cu index bed4dbc5a66..8413e716224 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "io/comp/nvcomp_adapter.hpp" @@ -26,22 +27,20 @@ #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" -#include "io/utilities/config_utils.hpp" #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" +#include "writer_impl_helpers.hpp" #include #include #include #include #include +#include #include #include #include -#include -#include -#include #include #include @@ -70,7 +69,8 @@ struct aggregate_writer_metadata { host_span const> kv_md, host_span tbl_schema, size_type num_columns, - statistics_freq stats_granularity) + statistics_freq stats_granularity, + std::string const arrow_schema_ipc_message) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -92,6 +92,13 @@ struct aggregate_writer_metadata { return KeyValue{kv.first, kv.second}; }); } + + // Append arrow schema to the key-value metadata + if (not arrow_schema_ipc_message.empty()) { + std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { + file.key_value_metadata.emplace_back(KeyValue{ARROW_SCHEMA_KEY, arrow_schema_ipc_message}); + }); + } } aggregate_writer_metadata(aggregate_writer_metadata const&) = default; @@ -182,26 +189,6 @@ struct aggregate_writer_metadata { namespace { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - /** * @brief Convert a mask of encodings to a vector. * @@ -326,6 +313,7 @@ struct leaf_schema_fn { column_in_metadata const& col_meta; bool timestamp_is_int96; bool timestamp_is_utc; + bool write_arrow_schema; template std::enable_if_t, void> operator()() @@ -493,10 +481,11 @@ struct leaf_schema_fn { } } - // unsupported outside cudf for parquet 1.0. template std::enable_if_t, void> operator()() { + // duration_D is based on int32_t and not a valid arrow duration type so simply convert to + // time32(ms). 
col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.stats_dtype = statistics_dtype::dtype_int32; @@ -507,62 +496,86 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + // Only write as time64 logical type if not writing arrow schema + if (not write_arrow_schema) { + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + } } - // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + // Only write as time64 logical type if not writing arrow schema + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template std::enable_if_t(), void> operator()() { - if (std::is_same_v) { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; - } else if (std::is_same_v) { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_decimal64; - col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; - } else if (std::is_same_v) { + // If writing arrow schema, then convert d32 and d64 to d128 + if (write_arrow_schema or std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { - CUDF_FAIL("Unsupported fixed point type for parquet writer"); + if (std::is_same_v) { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; + } else if (std::is_same_v) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; + } else { + CUDF_FAIL("Unsupported fixed point type for parquet writer"); + } } + + // Write logical and converted types, decimal scale and precision col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs col_schema.logical_type->decimal_type->scale = -col->type().scale(); @@ -590,33 +603,19 @@ struct leaf_schema_fn { } }; -inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Construct schema from input columns and per-column input options * * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. 
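// The MAX_DECIMAL32/64/128_PRECISION limits used in the fixed-point branch
// above follow the usual rule "largest power of ten representable by a signed
// integer of that width", i.e. floor(log10(2^(bits - 1) - 1)). A quick
// host-side check of that relationship; max_decimal_precision is an
// illustrative helper, not part of the patch.
#include <cmath>
#include <cstdint>

inline int32_t max_decimal_precision(int32_t bits)
{
  // 2^(bits - 1) - 1 is the largest positive value of a `bits`-wide signed integer
  return static_cast<int32_t>(std::floor((bits - 1) * std::log10(2.0)));
}
// max_decimal_precision(32) == 9, max_decimal_precision(64) == 18, max_decimal_precision(128) == 38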
*/ -std::vector construct_schema_tree( +std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, bool int96_timestamps, - bool utc_timestamps) + bool utc_timestamps, + bool write_arrow_schema) { std::vector schema; schema_tree_node root{}; @@ -629,7 +628,7 @@ std::vector construct_schema_tree( std::function add_schema = [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { - bool const col_nullable = is_col_nullable(col, col_meta, write_mode); + bool const col_nullable = is_output_column_nullable(col, col_meta, write_mode); auto set_field_id = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { @@ -854,7 +853,7 @@ std::vector construct_schema_tree( right_child_meta.set_name("value"); // check the repetition type of key is required i.e. the col should be non-nullable auto key_col = col->children[lists_column_view::child_column_index]->children[0]; - CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode), + CUDF_EXPECTS(!is_output_column_nullable(key_col, left_child_meta, write_mode), "key column cannot be nullable. For chunked writing, explicitly set the " "nullability to false in metadata"); // process key @@ -886,7 +885,8 @@ std::vector construct_schema_tree( cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); + leaf_schema_fn{ + col_schema, col, col_meta, timestamp_is_int96, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1148,7 +1148,6 @@ void calculate_page_fragments(device_span frag, * * @param frag_stats output statistics * @param frags Input page fragments - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, @@ -1164,32 +1163,6 @@ void gather_fragment_statistics(device_span frag_stats, stream.synchronize(); } -auto to_nvcomp_compression_type(Compression codec) -{ - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -auto page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, @@ -1629,23 +1602,127 @@ size_t column_index_buffer_size(EncColumnChunk* ck, } /** - * @brief Fill the table metadata with default column names. 
+ * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector * - * @param table_meta The table metadata to fill + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data */ -void fill_table_meta(std::unique_ptr const& table_meta) +template +rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, + rmm::cuda_stream_view stream) { - // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); - for (size_type i = 0; i < col_meta.num_children(); ++i) { - add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); - } - }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); - } + size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + + rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); + + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(d128_buffer.data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return d128_buffer; +} + +/** + * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, + * update the input table metadata, and return a new vector of column views. + * + * @param[in,out] table_meta The table metadata + * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data + */ +std::vector convert_decimal_columns_and_metadata( + table_input_metadata& table_meta, + std::vector>& d128_vectors, + table_view const& table, + rmm::cuda_stream_view stream) +{ + // Lambda function to convert each decimal32/decimal64 column to decimal128. + std::function convert_column = + [&](column_view column, column_in_metadata& metadata) -> column_view { + // Vector of passable-by-reference children column views + std::vector converted_children; + + // Process children column views first + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.num_children()), + std::back_inserter(converted_children), + [&](auto const idx) { return convert_column(column.child(idx), metadata.child(idx)); }); + + // Process this column view. Only convert if decimal32 and decimal64 column. 
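// A host-side sketch of the two's-complement widening rule implemented by
// convert_data_to_decimal128 above: the low-order word carries the value and
// every higher word is filled with the sign (all ones for negative values,
// zeros otherwise), assuming little-endian word order within the 128-bit
// result. widen_to_decimal128 is an illustrative name only.
#include <cstdint>

inline void widen_to_decimal128(int32_t value, int32_t out_words[4])
{
  out_words[0] = value;  // lowest-order 32 bits hold the decimal32 value
  for (int i = 1; i < 4; ++i) {
    out_words[i] = value < 0 ? -1 : 0;  // sign fill for the remaining 96 bits
  }
}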
+ switch (column.type().id()) { + case type_id::DECIMAL32: + // Convert data to decimal128 type + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + case type_id::DECIMAL64: + // Convert data to decimal128 type + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + default: + // Update the children vector keeping everything else the same + return {column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + } + }; + + // Vector of converted column views + std::vector converted_column_views; + + // Convert each column view + std::transform( + thrust::make_zip_iterator( + thrust::make_tuple(table.begin(), table_meta.column_metadata.begin())), + thrust::make_zip_iterator(thrust::make_tuple(table.end(), table_meta.column_metadata.end())), + std::back_inserter(converted_column_views), + [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + + return converted_column_views; } /** @@ -1698,12 +1775,22 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, + bool write_arrow_schema, host_span const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = - construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); + // Container to store decimal128 converted data if needed + std::vector> d128_vectors; + + // Convert decimal32/decimal64 data to decimal128 if writing arrow schema + // and initialize LinkedColVector + auto vec = table_to_linked_columns( + (write_arrow_schema) + ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + : input); + + auto schema_tree = construct_parquet_schema_tree( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. std::vector parquet_columns; @@ -1826,7 +1913,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity); + partitions, + kv_meta, + this_table_schema, + num_columns, + stats_granularity, + (write_arrow_schema) + ? 
construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2307,6 +2401,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2337,6 +2432,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2378,7 +2474,7 @@ void writer::impl::write(table_view const& input, std::vector co CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = std::make_unique(input); } - fill_table_meta(_table_meta); + fill_table_meta(*_table_meta); // All kinds of memory allocation and data compressions/encoding are performed here. // If any error occurs, such as out-of-memory exception, the internal state of the current @@ -2415,6 +2511,7 @@ void writer::impl::write(table_view const& input, std::vector co _int96_timestamps, _utc_timestamps, _write_v2_headers, + _write_arrow_schema, _out_sink, _stream); } catch (...) { // catch any exception type diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 784f78f06d5..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -156,6 +156,7 @@ class writer::impl { bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; + bool const _write_arrow_schema; std::optional> _sorting_columns; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp new file mode 100644 index 00000000000..e2f09f872d3 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file writer_impl_helpers.cpp + * @brief Helper function implementation for Parquet writer + */ + +#include "writer_impl_helpers.hpp" + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec) +{ + switch (codec) { + case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; + case Compression::ZSTD: return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +uint32_t page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); +} + +void fill_table_meta(table_input_metadata& table_meta) +{ + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta.column_metadata.size(); ++i) { + add_default_name(table_meta.column_metadata[i], "_col" + std::to_string(i)); + } +} + +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +{ + if (column.is_empty()) { return 0; } + + if (is_fixed_width(column.type())) { + return size_of(column.type()) * column.size(); + } else if (column.type().id() == type_id::STRING) { + auto const scol = strings_column_view(column); + return cudf::strings::detail::get_offset_value( + scol.offsets(), column.size() + column.offset(), stream) - + cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream); + } else if (column.type().id() == type_id::STRUCT) { + auto const scol = structs_column_view(column); + size_t ret = 0; + for (int i = 0; i < scol.num_children(); i++) { + ret += column_size(scol.get_sliced_child(i, stream), stream); + } + return ret; + } else if (column.type().id() == type_id::LIST) { + auto const lcol = lists_column_view(column); + return column_size(lcol.get_sliced_child(stream), stream); + } + + CUDF_FAIL("Unexpected compound type"); +} + +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) +{ + if (column_metadata.is_nullability_defined()) { + 
CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. " + "Metadata for input column with nulls cannot prescribe nullability = false"); + return column_metadata.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or column->nullable(); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp new file mode 100644 index 00000000000..a85411594e9 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file writer_impl_helpers.hpp + * @brief Helper function implementation for Parquet writer + */ + +#pragma once +#include "io/comp/nvcomp_adapter.hpp" +#include "parquet_common.hpp" + +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Function that translates GDF compression to parquet compression. + * + * @param compression The compression type + * @return The supported Parquet compression + */ +Compression to_parquet_compression(compression_type compression); + +/** + * @brief Function that translates the given compression codec to nvcomp compression type. + * + * @param codec Compression codec + * @return Translated nvcomp compression type + */ +nvcomp::compression_type to_nvcomp_compression_type(Compression codec); + +/** + * @brief Function that computes input alignment requirements for the given compression type. + * + * @param codec Compression codec + * @return Required alignment + */ +uint32_t page_alignment(Compression codec); + +/** + * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the + * batch. + * + * @param codec Compression codec + * @param compression_blocksize Size of the largest uncompressed chunk in the batch + * @return Maximum compressed chunk size + */ +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); + +/** + * @brief Fill the table metadata with default column names. + * + * @param table_meta The table metadata to fill + */ +void fill_table_meta(table_input_metadata& table_meta); + +/** + * @brief Compute size (in bytes) of the data stored in the given column. + * + * @param column The input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The data size of the input + */ +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); + +/** + * @brief Indicates if the column should be marked as nullable in the output schema + * + * Returns `true` if the input column is nullable or if the write mode is not set to + * write the table all at once instead of chunked. 
+ * + * @param column A view of the (linked) column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + ::cudf::io::detail::single_write_mode write_mode); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 0e3ce779089..badcd3f58f9 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -16,12 +16,12 @@ #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" -#include "io/utilities/config_utils.hpp" #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 51dc0ca90af..be2e2b9a79c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -55,6 +55,8 @@ #include #include +namespace cudf::io::text { +namespace detail { namespace { using cudf::io::text::detail::multistate; @@ -299,11 +301,6 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( } // namespace -namespace cudf { -namespace io { -namespace text { -namespace detail { - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, @@ -336,173 +333,181 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_EXPECTS(delimiter.size() < multistate::max_segment_value, "delimiter contains too many total tokens to produce a deterministic result."); - auto const concurrency = 2; - - // must be at least 32 when using warp-reduce on partials - // must be at least 1 more than max possible concurrent tiles - // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); - auto tile_multistates = - scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); - auto tile_offsets = - scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); - - multibyte_split_init_kernel<<>>( // - -TILES_PER_CHUNK, - TILES_PER_CHUNK, - tile_multistates, - tile_offsets, - cudf::io::text::detail::scan_tile_status::oob); - - auto multistate_seed = multistate(); - multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - - // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as - // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block - // would have to follow separate logic. - cudf::detail::device_single_thread( - [tm = scan_tile_state_view(tile_multistates), - to = scan_tile_state_view(tile_offsets), - multistate_seed] __device__() mutable { - tm.set_inclusive_prefix(-1, multistate_seed); - to.set_inclusive_prefix(-1, 0); - }, - stream); - - auto reader = source.create_reader(); - auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); - auto const byte_range_end = byte_range.offset() + byte_range.size(); - reader->skip_bytes(chunk_offset); - // amortize output chunk allocations over 8 worst-case outputs. 
This limits the overallocation - constexpr auto max_growth = 8; - output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); - output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); - - auto streams = cudf::detail::fork_streams(stream, concurrency); - - cudaEvent_t last_launch_event; - CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); - - auto& read_stream = streams[0]; - auto& scan_stream = streams[1]; - auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); - int64_t base_tile_idx = 0; + auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); std::optional first_row_offset; - std::optional last_row_offset; - bool found_last_offset = false; if (byte_range.offset() == 0) { first_row_offset = 0; } - std::swap(read_stream, scan_stream); - - while (chunk->size() > 0) { - // if we found the last delimiter, or didn't find delimiters inside the byte range at all: abort - if (last_row_offset.has_value() or - (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { - break; - } - - auto tiles_in_launch = - cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); - - auto row_offsets = row_offset_storage.next_output(scan_stream); + std::optional last_row_offset; - // reset the next chunk of tile state - multibyte_split_init_kernel<<(num_tile_states, stream, rmm::mr::get_current_device_resource()); + auto tile_offsets = scan_tile_state( + num_tile_states, stream, rmm::mr::get_current_device_resource()); + + multibyte_split_init_kernel<<>>( // - base_tile_idx, - tiles_in_launch, + stream.value()>>>( // + -TILES_PER_CHUNK, + TILES_PER_CHUNK, tile_multistates, - tile_offsets); + tile_offsets, + cudf::io::text::detail::scan_tile_status::oob); - CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event)); + auto multistate_seed = multistate(); + multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - if (delimiter.size() == 1) { - // the single-byte case allows for a much more efficient kernel, so we special-case it - byte_split_kernel<<>>( // - base_tile_idx, - chunk_offset, - row_offset_storage.size(), - tile_offsets, - delimiter[0], - *chunk, - row_offsets); - } else { - multibyte_split_kernel<<>>( // + // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as + // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block + // would have to follow separate logic. + cudf::detail::device_single_thread( + [tm = scan_tile_state_view(tile_multistates), + to = scan_tile_state_view(tile_offsets), + multistate_seed] __device__() mutable { + tm.set_inclusive_prefix(-1, multistate_seed); + to.set_inclusive_prefix(-1, 0); + }, + stream); + + auto reader = source.create_reader(); + auto const byte_range_end = byte_range.offset() + byte_range.size(); + reader->skip_bytes(chunk_offset); + // amortize output chunk allocations over 8 worst-case outputs. 
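// The chunk loop being reorganized here overlaps I/O and decoding with two
// streams: one stream reads the next chunk while the other scans the current
// one, and an event serializes successive scan launches. A stripped-down
// sketch of that double-buffering shape follows; names are placeholders and
// error checking plus the real kernel launches are omitted.
#include <cuda_runtime.h>
#include <utility>

inline void pipelined_scan_sketch()
{
  cudaStream_t read_stream{};
  cudaStream_t scan_stream{};
  cudaStreamCreate(&read_stream);
  cudaStreamCreate(&scan_stream);
  cudaEvent_t last_launch{};
  cudaEventCreate(&last_launch);

  bool have_chunk = true;  // stands in for "chunk->size() > 0"
  while (have_chunk) {
    cudaStreamWaitEvent(scan_stream, last_launch);  // order this scan after the previous one
    // ... launch scan kernels for the current chunk on scan_stream ...
    // ... kick off the read of the next chunk on read_stream ...
    cudaEventRecord(last_launch, scan_stream);
    std::swap(read_stream, scan_stream);  // the two streams alternate roles each iteration
    have_chunk = false;                   // placeholder termination
  }

  cudaEventDestroy(last_launch);
  cudaStreamDestroy(read_stream);
  cudaStreamDestroy(scan_stream);
}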
This limits the overallocation + constexpr auto max_growth = 8; + output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); + output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); + + auto streams = cudf::detail::fork_streams(stream, concurrency); + + cudaEvent_t last_launch_event; + CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); + + auto& read_stream = streams[0]; + auto& scan_stream = streams[1]; + auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); + int64_t base_tile_idx = 0; + bool found_last_offset = false; + std::swap(read_stream, scan_stream); + + while (chunk->size() > 0) { + // if we found the last delimiter, or didn't find delimiters inside the byte range at all: + // abort + if (last_row_offset.has_value() or + (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { + break; + } + + auto tiles_in_launch = + cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); + + auto row_offsets = row_offset_storage.next_output(scan_stream); + + // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // base_tile_idx, - chunk_offset, - row_offset_storage.size(), + tiles_in_launch, tile_multistates, - tile_offsets, - {device_delim.data(), static_cast(device_delim.size())}, - *chunk, - row_offsets); - } + tile_offsets); + + CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event)); + + if (delimiter.size() == 1) { + // the single-byte case allows for a much more efficient kernel, so we special-case it + byte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + row_offset_storage.size(), + tile_offsets, + delimiter[0], + *chunk, + row_offsets); + } else { + multibyte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + row_offset_storage.size(), + tile_multistates, + tile_offsets, + {device_delim.data(), static_cast(device_delim.size())}, + *chunk, + row_offsets); + } - // load the next chunk - auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); - // while that is running, determine how many offsets we output (synchronizes) - auto const new_offsets = [&] { - auto const new_offsets_unclamped = - tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - - static_cast(row_offset_storage.size()); - // if we are not in the last chunk, we can use all offsets - if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { - return new_offsets_unclamped; + // load the next chunk + auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); + // while that is running, determine how many offsets we output (synchronizes) + auto const new_offsets = [&] { + auto const new_offsets_unclamped = + tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - + static_cast(row_offset_storage.size()); + // if we are not in the last chunk, we can use all offsets + if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { + return new_offsets_unclamped; + } + // if we are in the last chunk, we need to find the first out-of-bounds offset + auto const it = thrust::make_counting_iterator(output_offset{}); + auto const end_loc = + *thrust::find_if(rmm::exec_policy_nosync(scan_stream), + it, + it + new_offsets_unclamped, + [row_offsets, byte_range_end] __device__(output_offset i) { + return row_offsets[i] >= byte_range_end; + }); + // if we had no out-of-bounds offset, we copy all offsets + if (end_loc == new_offsets_unclamped) { return end_loc; } + // otherwise we copy only up to (including) the first 
out-of-bounds delimiter + found_last_offset = true; + return end_loc + 1; + }(); + row_offset_storage.advance_output(new_offsets, scan_stream); + // determine if we found the first or last field offset for the byte range + if (new_offsets > 0 and not first_row_offset) { + first_row_offset = row_offset_storage.front_element(scan_stream); + } + if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } + // copy over the characters we need, if we already encountered the first field delimiter + if (first_row_offset.has_value()) { + auto const begin = + chunk->data() + std::max(0, *first_row_offset - chunk_offset); + auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); + auto const end = + chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); + auto const output_size = end - begin; + auto char_output = char_storage.next_output(scan_stream); + thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); + char_storage.advance_output(output_size, scan_stream); } - // if we are in the last chunk, we need to find the first out-of-bounds offset - auto const it = thrust::make_counting_iterator(output_offset{}); - auto const end_loc = - *thrust::find_if(rmm::exec_policy_nosync(scan_stream), - it, - it + new_offsets_unclamped, - [row_offsets, byte_range_end] __device__(output_offset i) { - return row_offsets[i] >= byte_range_end; - }); - // if we had no out-of-bounds offset, we copy all offsets - if (end_loc == new_offsets_unclamped) { return end_loc; } - // otherwise we copy only up to (including) the first out-of-bounds delimiter - found_last_offset = true; - return end_loc + 1; - }(); - row_offset_storage.advance_output(new_offsets, scan_stream); - // determine if we found the first or last field offset for the byte range - if (new_offsets > 0 and not first_row_offset) { - first_row_offset = row_offset_storage.front_element(scan_stream); - } - if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } - // copy over the characters we need, if we already encountered the first field delimiter - if (first_row_offset.has_value()) { - auto const begin = chunk->data() + std::max(0, *first_row_offset - chunk_offset); - auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); - auto const end = - chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); - auto const output_size = end - begin; - auto char_output = char_storage.next_output(scan_stream); - thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); - char_storage.advance_output(output_size, scan_stream); - } - CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value())); + CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value())); - std::swap(read_stream, scan_stream); - base_tile_idx += tiles_in_launch; - chunk_offset += chunk->size(); - chunk = std::move(next_chunk); - } + std::swap(read_stream, scan_stream); + base_tile_idx += tiles_in_launch; + chunk_offset += chunk->size(); + chunk = std::move(next_chunk); + } + + CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); - CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); + cudf::detail::join_streams(streams, stream); - cudf::detail::join_streams(streams, stream); + auto chars = char_storage.gather(stream, mr); + auto global_offsets = row_offset_storage.gather(stream, mr); + return std::pair{std::move(global_offsets), std::move(chars)}; + }(); // if the input was empty, we didn't find a delimiter 
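// The scan loop above is now wrapped in an immediately invoked lambda so that
// its temporaries (streams, tile state, output builders) are destroyed before
// the column is assembled, and only the gathered offsets and characters
// escape. A generic sketch of that pattern, with illustrative names only:
#include <string>
#include <utility>
#include <vector>

inline std::pair<std::vector<int>, std::string> produce()
{
  auto [offsets, chars] = [] {
    std::vector<int> local_offsets{0, 3};  // temporaries live only inside the lambda
    std::string local_chars = "abc";
    return std::pair{std::move(local_offsets), std::move(local_chars)};
  }();
  // only the returned values are visible from here on
  return {std::move(offsets), std::move(chars)};
}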
at all, // or the first delimiter was also the last: empty output @@ -511,9 +516,6 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return make_empty_column(type_id::STRING); } - auto chars = char_storage.gather(stream, mr); - auto global_offsets = row_offset_storage.gather(stream, mr); - // insert an offset at the beginning if we started at the beginning of the input bool const insert_begin = first_row_offset.value_or(0) == 0; // insert an offset at the end if we have not terminated the last row @@ -591,6 +593,4 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return result; } -} // namespace text -} // namespace io -} // namespace cudf +} // namespace cudf::io::text diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 20ac89b4d53..a3afbd52896 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -14,14 +14,16 @@ * limitations under the License. */ -#include "config_utils.hpp" +#include "getenv_or.hpp" +#include #include #include +#include #include -namespace cudf::io::detail { +namespace cudf::io { namespace cufile_integration { @@ -80,4 +82,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ } // namespace nvcomp_integration -} // namespace cudf::io::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index a6cbbcd84a6..1dbb9369115 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -15,8 +15,9 @@ */ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include +#include #include #include @@ -40,7 +41,7 @@ class file_sink : public data_sink { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } - if (detail::cufile_integration::is_kvikio_enabled()) { + if (cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ca8932322bf..91be154e09d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,9 +15,10 @@ */ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include #include +#include #include #include #include @@ -44,7 +45,7 @@ class file_source : public datasource { explicit file_source(char const* filepath) : _file(filepath, O_RDONLY) { detail::force_init_cuda_context(); - if (detail::cufile_integration::is_kvikio_enabled()) { + if (cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); @@ -216,7 +217,7 @@ class memory_mapped_source : public file_source { void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); + CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); @@ -433,7 +434,7 @@ std::unique_ptr datasource::create(std::string const& filepath, size_t size) { #ifdef CUFILE_FOUND - if (detail::cufile_integration::is_always_enabled()) { + if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads return std::make_unique(filepath.c_str()); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index a9d4f19c848..d7b54399f8d 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -16,9 +16,11 @@ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include "getenv_or.hpp" #include +#include +#include #include @@ -221,7 +223,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) // The benefit from multithreaded read plateaus around 16 threads pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16)) { - pool.sleep_duration = 10; } namespace { @@ -230,14 +231,15 @@ template > std::vector> make_sliced_tasks( - F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) + F function, DataT* ptr, size_t offset, size_t size, BS::thread_pool& pool) { constexpr size_t default_max_slice_size = 4 * 1024 * 1024; static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { - return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + return pool.submit_task( + [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 91ef41fba6e..441bede200d 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -19,8 +19,7 @@ #ifdef CUFILE_FOUND #include -#include - +#include #include #endif @@ -150,7 +149,7 @@ class cufile_input_impl final : public cufile_input { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; /** @@ -167,7 +166,7 @@ class cufile_output_impl final : public cufile_output { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; #else diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/getenv_or.hpp similarity index 63% rename from cpp/src/io/utilities/config_utils.hpp rename to cpp/src/io/utilities/getenv_or.hpp index 74df1375e6f..3fd97a00b61 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -13,15 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include +#include #include #include -namespace cudf::io::detail { - +namespace { /** * @brief Returns the value of the environment variable, or a default value if the variable is not * present. @@ -45,37 +46,4 @@ T getenv_or(std::string_view env_var_name, T default_val) return converted_val; } -namespace cufile_integration { - -/** - * @brief Returns true if cuFile and its compatibility mode are enabled. - */ -bool is_always_enabled(); - -/** - * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). - */ -bool is_gds_enabled(); - -/** - * @brief Returns true if KvikIO is enabled. - */ -bool is_kvikio_enabled(); - -} // namespace cufile_integration - -namespace nvcomp_integration { - -/** - * @brief Returns true if all nvCOMP uses are enabled. - */ -bool is_all_enabled(); - -/** - * @brief Returns true if stable nvCOMP use is enabled. - */ -bool is_stable_enabled(); - -} // namespace nvcomp_integration - -} // namespace cudf::io::detail +} // namespace diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 0fc1c3718b1..ea59f23c77f 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 01e3fe09b38..1f31eaa7878 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 618e7a9082e..00a90f8273f 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -35,20 +36,19 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + 
table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index fbb0f6cb0f5..4fb983dc5a6 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -76,18 +76,6 @@ struct comparator_adapter { { } - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ constexpr auto operator()(lhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - auto const lhs = static_cast(lhs_index); - auto const rhs = static_cast(rhs_index); - - return _self_equal(lhs, rhs); - } - __device__ constexpr auto operator()(rhs_index_type lhs_index, rhs_index_type rhs_index) const noexcept { @@ -103,13 +91,6 @@ struct comparator_adapter { return _two_table_equal(lhs_index, rhs_index); } - __device__ constexpr auto operator()(rhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - return _two_table_equal(lhs_index, rhs_index); - } -#pragma nv_diagnostic pop - private: SelfEqual const _self_equal; TwoTableEqual const _two_table_equal; diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 43a3d69091a..2ca22f0e017 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -451,8 +451,8 @@ struct replace_multi_fn { while (spos < d_str.size_bytes()) { for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { auto const d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match { auto const d_repl = (d_repls.size() == 1) ? 
d_repls.element(0) : d_repls.element(tgt_idx); @@ -468,9 +468,8 @@ struct replace_multi_fn { } ++spos; } - if (out_ptr) // copy remainder - { - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + if (out_ptr) { + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); // copy remainder } else { d_sizes[idx] = bytes; } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 23614ac0733..4d7096c02ca 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -357,6 +357,12 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); + if (chars_bytes == 0) { + auto offsets = cudf::make_column_from_scalar( + numeric_scalar(0, true, stream), strings_count + 1, stream, mr); + auto tokens = rmm::device_uvector(0, stream); + return std::pair{std::move(offsets), std::move(tokens)}; + } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh index 3bb574748b6..a2e441c3284 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -89,14 +89,6 @@ struct bpe_equal { return lhs == rhs; // all rows are unique } // used by find - __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept - { - lhs *= 2; - auto const left = d_strings.element(lhs); - auto const right = d_strings.element(lhs + 1); - return (left == rhs.first) && (right == rhs.second); - } - // used by find __device__ bool operator()(merge_pair_type const& lhs, cudf::size_type rhs) const noexcept { rhs *= 2; @@ -157,11 +149,6 @@ struct mp_equal { return left == right; } // used by find - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept - { - auto const left = d_strings.element(lhs); - return left == rhs; - } __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept { auto const right = d_strings.element(rhs); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 9cf934165f6..e465fb79c89 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -19,16 +19,19 @@ #include #include #include +#include #include -#include +#include +#include +#include #include #include #include -#include #include #include +#include #include #include @@ -36,127 +39,375 @@ #include #include #include +#include +#include +#include #include namespace nvtext { namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type bytes_per_thread = 4; + /** * @brief Retrieve the row data (span) for the given column/row-index * - * @param d_input Input lists column + * @param values Flat vector of all values + * @param offsets Offsets identifying rows within values * @param idx Row index to retrieve * @return A device-span of the row values */ -__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx) +__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx) { - auto const offsets = - d_input.child(cudf::lists_column_view::offsets_column_index).data(); - auto const offset = offsets[idx]; - auto const size = offsets[idx + 1] - offset; - auto const begin = - d_input.child(cudf::lists_column_view::child_column_index).data() + 
offset; + auto const offset = offsets[row_idx]; + auto const size = offsets[row_idx + 1] - offset; + auto const begin = values + offset; return cudf::device_span(begin, size); } /** - * @brief Count the unique values within each row of the input column + * @brief Kernel to count the unique values within each row of the input column + * + * This is called with a warp per row. * - * This is called with a warp per row + * @param d_values Sorted hash values to count uniqueness + * @param d_offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param d_results Number of unique values in each row */ -struct sorted_unique_fn { - cudf::column_device_view const d_input; - cudf::size_type* d_results; +CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values, + int64_t const* d_offsets, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - auto const row = get_row(d_input, row_idx); - auto const begin = row.begin(); + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + auto const row = get_row(d_values, d_offsets, row_idx); + auto const begin = row.begin(); - cudf::size_type count = 0; - for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { - count += (itr == begin || *itr != *(itr - 1)); - } - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + cudf::size_type count = 0; + for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { + count += (itr == begin || *itr != *(itr - 1)); } -}; + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} -rmm::device_uvector compute_unique_counts(cudf::column_view const& input, +/** + * @brief Count the unique values within each row of the input column + * + * @param values Sorted hash values to count uniqueness + * @param offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of unique values + */ +rmm::device_uvector compute_unique_counts(uint32_t const* values, + int64_t const* offsets, + cudf::size_type rows, rmm::cuda_stream_view stream) { - auto const d_input = cudf::column_device_view::create(input, stream); - auto d_results = rmm::device_uvector(input.size(), stream); - sorted_unique_fn fn{*d_input, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input.size() * cudf::detail::warp_size, - fn); + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_unique_fn<<>>( + values, offsets, rows, d_results.data()); return d_results; } +/** + * @brief Kernel to count the number of common values within each row of the 2 input columns + * + * This 
is called with a warp per row. + * + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param d_results Number of common values in each row + */ +CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1, + int64_t const* d_offsets1, + uint32_t const* d_values2, + int64_t const* d_offsets2, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; + + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + auto const needles = get_row(d_values1, d_offsets1, row_idx); + auto const haystack = get_row(d_values2, d_offsets2, row_idx); + + auto begin = haystack.begin(); + auto const end = haystack.end(); + + cudf::size_type count = 0; + for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; + itr += cudf::detail::warp_size) { + if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates + // search haystack for this needle (*itr) + auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); + count += (found != end) && (*found == *itr); // increment if found; + begin = found; // shorten the next lower-bound range + } + // sum up the counts across this warp + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} + /** * @brief Count the number of common values within each row of the 2 input columns * - * This is called with a warp per row + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of common values */ -struct sorted_intersect_fn { - cudf::column_device_view const d_input1; - cudf::column_device_view const d_input2; - cudf::size_type* d_results; +rmm::device_uvector compute_intersect_counts(uint32_t const* values1, + int64_t const* offsets1, + uint32_t const* values2, + int64_t const* offsets2, + cudf::size_type rows, + rmm::cuda_stream_view stream) +{ + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_intersect_fn<<>>( + values1, offsets1, values2, offsets2, rows, d_results.data()); + return d_results; +} - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; +/** + * @brief Counts the number of substrings in each row of the given strings column + * + * Each warp processes a single string. + * Formula is `count = max(1, str.length() - width + 1)` + * If a string has less than width characters (but not empty), the count is 1 + * since the entire string is still hashed. 
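Note: a minimal host-side sketch of the counting rule described in this comment, count = max(1, character_count - width + 1), with empty strings counted as 0. The UTF-8 begin-byte test below is a standalone re-implementation for illustration only, not the cudf::strings::detail helper used by the kernel.

#include <algorithm>
#include <cstdio>
#include <string>

// a byte begins a UTF-8 character unless it is a continuation byte (10xxxxxx)
bool is_utf8_char_begin(unsigned char b) { return (b & 0xC0) != 0x80; }

int substring_count(std::string const& s, int width)
{
  if (s.empty()) { return 0; }
  int chars = 0;
  for (unsigned char b : s) { chars += is_utf8_char_begin(b); }
  return std::max(1, chars - width + 1);  // formula from the comment above
}

int main()
{
  std::printf("%d\n", substring_count("hello world", 5));  // 7
  std::printf("%d\n", substring_count("hi", 5));           // 1 (shorter than width)
  std::printf("%d\n", substring_count("", 5));             // 0
}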
+ * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_counts Output number of substring per row of input + */ +CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t* d_counts) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } - auto const needles = get_row(d_input1, row_idx); - auto const haystack = get_row(d_input2, row_idx); + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } - auto begin = haystack.begin(); - auto const end = haystack.end(); + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - // TODO: investigate cuCollections device-side static-map to match row values + auto const end = d_str.data() + d_str.size_bytes(); + auto const lane_idx = idx % cudf::detail::warp_size; + cudf::size_type count = 0; + for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; + itr += cudf::detail::warp_size * bytes_per_thread) { + for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { + count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); + } + } + auto const char_count = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } +} + +/** + * @brief Kernel to hash the substrings for each input row + * + * Each warp processes a single string. 
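Note: sorted_unique_fn, sorted_intersect_fn, and count_substrings_kernel above all use the same warp-per-row pattern: each lane accumulates a partial count, cub::WarpReduce sums it, and lane 0 writes the result. A minimal standalone CUDA sketch of that pattern follows; the demo kernel, data, and sizes are invented for illustration and error checking is omitted.

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

// one warp sums partial counts; only lane 0 holds the valid total
__global__ void warp_sum_demo(int const* values, int n, int* result)
{
  using warp_reduce = cub::WarpReduce<int>;
  __shared__ typename warp_reduce::TempStorage temp_storage;

  int count = 0;
  for (int i = threadIdx.x; i < n; i += 32) { count += values[i]; }
  int const total = warp_reduce(temp_storage).Sum(count);
  if (threadIdx.x == 0) { *result = total; }
}

int main()
{
  int h_values[64];
  for (int& v : h_values) { v = 1; }
  int *d_values = nullptr, *d_result = nullptr;
  cudaMalloc(&d_values, sizeof(h_values));
  cudaMalloc(&d_result, sizeof(int));
  cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
  warp_sum_demo<<<1, 32>>>(d_values, 64, d_result);
  int h_result = 0;
  cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("warp sum = %d\n", h_result);  // 64
  cudaFree(d_values);
  cudaFree(d_result);
}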
+ * Substrings of string "hello world" with width=4 produce: + * "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld" + * Each of these substrings is hashed and the hash stored in d_results + * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_output_offsets Offsets into d_results + * @param d_results Hash values for each substring + */ +CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t const* d_output_offsets, + uint32_t* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - cudf::size_type count = 0; - for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; - itr += cudf::detail::warp_size) { - if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates - // search haystack for this needle (*itr) - auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); - count += (found != end) && (*found == *itr); // increment if found; - begin = found; // shorten the next lower-bound range + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + + __shared__ uint32_t hvs[block_size]; // temp store for hash values + + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const end = d_str.data() + d_str.size_bytes(); + auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; + + auto d_hashes = d_results + d_output_offsets[str_idx]; + auto itr = d_str.data() + lane_idx; + for (auto i = 0; i < warp_count; ++i) { + uint32_t hash = 0; + if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) { + // resolve substring + auto const sub_str = + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width); + // hash only if we have the full width of characters or this is the beginning of the string + if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); } } - // sum up the counts across this warp - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + hvs[threadIdx.x] = hash; // store hash into shared memory + __syncwarp(); + if (lane_idx == 0) { + // copy valid hash values for this warp into d_hashes + auto const hashes = &hvs[threadIdx.x]; + auto const hashes_end = hashes + cudf::detail::warp_size; + d_hashes = + thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; }); + } + __syncwarp(); + itr += cudf::detail::warp_size; } -}; +} -rmm::device_uvector compute_intersect_counts(cudf::column_view const& input1, - cudf::column_view const& input2, - rmm::cuda_stream_view stream) +void segmented_sort(uint32_t const* input, + uint32_t* output, + int64_t items, + cudf::size_type segments, + int64_t const* offsets, + rmm::cuda_stream_view stream) { - auto const d_input1 = cudf::column_device_view::create(input1, stream); - auto const d_input2 = cudf::column_device_view::create(input2, stream); - auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), 
- thrust::counting_iterator(0), - input1.size() * cudf::detail::warp_size, - fn); - return d_results; + rmm::device_buffer temp; + std::size_t temp_bytes = 0; + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); + temp = rmm::device_buffer(temp_bytes, stream); + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); +} + +/** + * @brief Create hashes for each substring + * + * The hashes are sorted using a segmented-sort as setup to + * perform the unique and intersect operations. + * + * @param input Input strings column to hash + * @param width Substring width in characters + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The sorted hash values and offsets to each row + */ +std::pair, rmm::device_uvector> hash_substrings( + cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream) +{ + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + // count substrings + auto offsets = rmm::device_uvector(input.size() + 1, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(input.size()) * cudf::detail::warp_size, block_size); + count_substrings_kernel<<>>( + *d_strings, width, offsets.data()); + auto const total_hashes = + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + + // hash substrings + rmm::device_uvector hashes(total_hashes, stream); + substring_hash_kernel<<>>( + *d_strings, width, offsets.data(), hashes.data()); + + // sort hashes + rmm::device_uvector sorted(total_hashes, stream); + if (total_hashes < static_cast(std::numeric_limits::max())) { + segmented_sort( + hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream); + } else { + // The CUB segmented sort can only handle max total values + // so this code calls it in sections. + auto const section_size = std::numeric_limits::max() / 2L; + auto const sort_sections = cudf::util::div_rounding_up_safe(total_hashes, section_size); + auto const offset_indices = [&] { + // build a set of indices that point to offsets subsections + auto sub_offsets = rmm::device_uvector(sort_sections + 1, stream); + thrust::sequence( + rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size); + auto indices = rmm::device_uvector(sub_offsets.size(), stream); + thrust::lower_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + sub_offsets.begin(), + sub_offsets.end(), + indices.begin()); + return cudf::detail::make_std_vector_sync(indices, stream); + }(); + + // Call segmented sort with the sort sections + for (auto i = 0L; i < sort_sections; ++i) { + auto const index1 = offset_indices[i]; + auto const index2 = std::min(offset_indices[i + 1], static_cast(offsets.size() - 1)); + auto const offset1 = offsets.element(index1, stream); + auto const offset2 = offsets.element(index2, stream); + + auto const num_items = offset2 - offset1; + auto const num_segments = index2 - index1; + + // There is a bug in the CUB segmented sort and the workaround is to + // shift the offset values so the first offset is 0. + // This transform can be removed once the bug is fixed. 
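Note: a small host-side illustration of the offset shift described in the comment above: the section-local offsets are rebased so the first one is zero before being handed to the segmented sort, which is what the device-side thrust::transform that follows does. The offset values here are made up.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // absolute offsets for one section of segments
  std::vector<int64_t> offsets{100, 140, 190, 260};
  std::vector<int64_t> sort_offsets(offsets.size());
  int64_t const first = offsets.front();
  // shift so the section starts at offset 0
  std::transform(offsets.begin(), offsets.end(), sort_offsets.begin(),
                 [first](int64_t o) { return o - first; });
  for (auto o : sort_offsets) { std::printf("%lld ", static_cast<long long>(o)); }  // 0 40 90 160
  std::printf("\n");
}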
+ auto sort_offsets = rmm::device_uvector(num_segments + 1, stream); + thrust::transform(rmm::exec_policy(stream), + offsets.begin() + index1, + offsets.begin() + index2 + 1, + sort_offsets.begin(), + [offset1] __device__(auto const o) { return o - offset1; }); + + segmented_sort(hashes.begin() + offset1, + sorted.begin() + offset1, + num_items, + num_segments, + sort_offsets.begin(), + stream); + } + } + return std::make_pair(std::move(sorted), std::move(offsets)); } /** @@ -186,62 +437,6 @@ struct jaccard_fn { } }; -/** - * @brief Create hashes for each substring - * - * Uses the hash_character_ngrams to hash substrings of the input column. - * This returns a lists column where each row is the hashes for the substrings - * of the corresponding input string row. - * - * The hashes are then sorted using a segmented-sort as setup to - * perform the unique and intersect operations. - */ -std::unique_ptr hash_substrings(cudf::strings_column_view const& col, - cudf::size_type width, - rmm::cuda_stream_view stream) -{ - auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource()); - auto const input = cudf::lists_column_view(hashes->view()); - auto const offsets = input.offsets_begin(); - auto const data = input.child().data(); - - rmm::device_uvector sorted(input.child().size(), stream); - - // this is wicked fast and much faster than using cudf::lists::detail::sort_list - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - - auto contents = hashes->release(); - // the offsets are taken from the hashes column since they are the same - // before and after the segmented-sort - return cudf::make_lists_column( - col.size(), - std::move(contents.children.front()), - std::make_unique(std::move(sorted), rmm::device_buffer{}, 0), - 0, - rmm::device_buffer{}, - stream, - rmm::mr::get_current_device_resource()); -} } // namespace std::unique_ptr jaccard_index(cudf::strings_column_view const& input1, @@ -261,13 +456,14 @@ std::unique_ptr jaccard_index(cudf::strings_column_view const& inp auto const [d_uniques1, d_uniques2, d_intersects] = [&] { // build hashes of the substrings - auto const hash1 = hash_substrings(input1, width, stream); - auto const hash2 = hash_substrings(input2, width, stream); + auto const [hash1, offsets1] = hash_substrings(input1, width, stream); + auto const [hash2, offsets2] = hash_substrings(input2, width, stream); // compute the unique counts in each set and the intersection counts - auto d_uniques1 = compute_unique_counts(hash1->view(), stream); - auto d_uniques2 = compute_unique_counts(hash2->view(), stream); - auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream); + auto d_uniques1 = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream); + auto d_uniques2 = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream); + auto d_intersects = compute_intersect_counts( + hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream); return std::tuple{std::move(d_uniques1), std::move(d_uniques2), 
std::move(d_intersects)}; }(); diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index ea09f5d17af..97abb1487d8 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -86,18 +86,10 @@ struct vocab_equal { return lhs == rhs; // all rows are expected to be unique } // used by find - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept - { - return d_strings.element(lhs) == rhs; - } __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept { return d_strings.element(rhs) == lhs; } -#pragma nv_diagnostic pop }; using probe_scheme = cuco::linear_probing<1, vocab_hasher>; diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 64427326d87..ec21813705a 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,14 @@ std::unique_ptr cast(column_view const& input, return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, stream, mr); } +struct is_supported_cast_impl { + template + bool operator()() const + { + return is_supported_cast(); + } +}; + } // namespace detail std::unique_ptr cast(column_view const& input, @@ -470,4 +479,11 @@ std::unique_ptr cast(column_view const& input, return detail::cast(input, type, stream, mr); } +bool is_supported_cast(data_type from, data_type to) noexcept +{ + // No matching detail API call/nvtx annotation, since this doesn't + // launch a kernel. + return double_type_dispatcher(from, to, detail::is_supported_cast_impl{}); +} + } // namespace cudf diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 1346dcd4666..def6e94452e 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
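Note: the new public cudf::is_supported_cast(from, to) above dispatches over both types without launching a kernel, so callers can check castability up front. A minimal usage sketch follows; the include path is an assumption (the same header that declares cudf::cast, cudf/unary.hpp).

#include <cudf/types.hpp>
#include <cudf/unary.hpp>
#include <cstdio>

int main()
{
  using cudf::data_type;
  using cudf::type_id;
  // query whether a cast between two types is supported before attempting it
  bool const ok = cudf::is_supported_cast(data_type{type_id::INT32}, data_type{type_id::FLOAT64});
  std::printf("INT32 -> FLOAT64 cast supported: %s\n", ok ? "yes" : "no");
}

Because the call is noexcept and launches no kernel, the diff skips the usual detail-API/nvtx wrapper, as its comment notes.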
* Copyright 2018 Christian Noboa Mardini @@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) EXPECT_THROW(cudf::binary_operation( lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)), - cudf::logic_error); + std::invalid_argument); } diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index ab7984d4b03..a222289216d 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -38,7 +38,7 @@ struct FixedPointTest : public cudf::test::BaseFixture {}; template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; -using RepresentationTypes = ::testing::Types; +using RepresentationTypes = ::testing::Types; TYPED_TEST_SUITE(FixedPointTestAllReps, RepresentationTypes); @@ -53,6 +53,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) auto num4 = cudf::convert_floating_to_fixed(1.234567, scale_type(-4)); auto num5 = cudf::convert_floating_to_fixed(1.234567, scale_type(-5)); auto num6 = cudf::convert_floating_to_fixed(1.234567, scale_type(-6)); + auto num7 = cudf::convert_floating_to_fixed(0.0, scale_type(-4)); EXPECT_EQ(1, cudf::convert_fixed_to_floating(num0)); EXPECT_EQ(1.2, cudf::convert_fixed_to_floating(num1)); @@ -61,6 +62,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating(num4)); EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating(num5)); EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(0.0, cudf::convert_fixed_to_floating(num7)); } TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) @@ -74,6 +76,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) auto num4 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-4)); auto num5 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-5)); auto num6 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-6)); + auto num7 = cudf::convert_floating_to_fixed(-0.0, scale_type(-4)); EXPECT_EQ(-1, cudf::convert_fixed_to_floating(num0)); EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating(num1)); @@ -82,6 +85,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating(num4)); EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating(num5)); EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(-0.0, cudf::convert_fixed_to_floating(num7)); } TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) @@ -99,14 +103,10 @@ TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) EXPECT_EQ(1.1, cudf::convert_fixed_to_floating(a)); EXPECT_EQ(1.01, cudf::convert_fixed_to_floating(b)); - EXPECT_EQ(1, - cudf::convert_fixed_to_floating( - c)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.001, cudf::convert_fixed_to_floating(c)); EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating(d)); EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating(e)); - EXPECT_EQ(1, - cudf::convert_fixed_to_floating( - f)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.000001, cudf::convert_fixed_to_floating(f)); EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating(x) < std::numeric_limits::epsilon()); @@ -153,6 +153,119 @@ TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction) EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating(num1)); } +TEST_F(FixedPointTest, PreciseFloatDecimal64Construction) +{ + // Need 9 decimal digits 
to uniquely represent all floats (numeric_limits::max_digits10()). + // Precise conversion: set the scale factor to 9 less than the order-of-magnitude. + // But with -9 scale factor decimal32 can overflow: use decimal64 instead. + + // Positive Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E7f, scale_type(-2)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E12f, scale_type(3)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E17f, scale_type(8)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E22f, scale_type(13)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E27f, scale_type(18)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E32f, scale_type(23)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E37f, scale_type(28)); + + EXPECT_EQ(3.141593E7f, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E12f, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E17f, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E22f, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E27f, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E32f, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E37f, cudf::convert_fixed_to_floating(num6)); + } + + // Negative Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E-7f, scale_type(-16)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E-12f, scale_type(-21)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E-17f, scale_type(-26)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E-22f, scale_type(-31)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E-27f, scale_type(-36)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E-32f, scale_type(-41)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E-37f, scale_type(-47)); + + EXPECT_EQ(3.141593E-7f, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E-12f, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E-17f, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E-22f, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E-27f, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E-32f, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E-37f, cudf::convert_fixed_to_floating(num6)); + + // Denormals + auto num7 = cudf::convert_floating_to_fixed(3.141593E-39f, scale_type(-48)); + auto num8 = cudf::convert_floating_to_fixed(3.141593E-41f, scale_type(-50)); + auto num9 = cudf::convert_floating_to_fixed(3.141593E-43f, scale_type(-52)); + auto num10 = cudf::convert_floating_to_fixed(FLT_TRUE_MIN, scale_type(-54)); + + EXPECT_EQ(3.141593E-39f, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(3.141593E-41f, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(3.141593E-43f, cudf::convert_fixed_to_floating(num9)); + EXPECT_EQ(FLT_TRUE_MIN, cudf::convert_fixed_to_floating(num10)); + } +} + +TEST_F(FixedPointTest, PreciseDoubleDecimal64Construction) +{ + // Need 17 decimal digits to uniquely represent all doubles (numeric_limits::max_digits10()). + // Precise conversion: set the scale factor to 17 less than the order-of-magnitude. 
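Note: a host-side arithmetic sketch of the scale-selection rule described in these comments: the scale is the value's order of magnitude minus max_digits10 (9 for float, 17 for double). pick_scale is an illustrative helper, not a cudf API; the printed scales match the scale_type arguments in the float test above, and the same rule with 17 yields the double scales.

#include <cmath>
#include <cstdio>
#include <limits>

// order-of-magnitude minus max_digits10 gives a scale that round-trips exactly
int pick_scale(float x)
{
  int const magnitude = static_cast<int>(std::floor(std::log10(std::fabs(x))));
  return magnitude - std::numeric_limits<float>::max_digits10;
}

int main()
{
  std::printf("%d\n", pick_scale(3.141593E7f));   // -2, matching scale_type(-2) above
  std::printf("%d\n", pick_scale(3.141593E12f));  //  3, matching scale_type(3) above
  std::printf("%d\n", pick_scale(3.141593E-7f));  // -16, matching scale_type(-16) above
}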
+ + using decimal64 = fixed_point; + + // Positive Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E8, scale_type(-9)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E58, scale_type(41)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E108, scale_type(91)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E158, scale_type(141)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E208, scale_type(191)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E258, scale_type(241)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E307, scale_type(290)); + + EXPECT_EQ(3.141593E8, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E58, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E108, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E158, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E208, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E258, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E307, cudf::convert_fixed_to_floating(num6)); + } + + // Negative Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E-8, scale_type(-25)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E-58, scale_type(-75)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E-108, scale_type(-125)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E-158, scale_type(-175)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E-208, scale_type(-225)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E-258, scale_type(-275)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E-308, scale_type(-325)); + + EXPECT_EQ(3.141593E-8, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E-58, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E-108, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E-158, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E-208, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E-258, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E-308, cudf::convert_fixed_to_floating(num6)); + + // Denormals + auto num7 = cudf::convert_floating_to_fixed(3.141593E-309, scale_type(-326)); + auto num8 = cudf::convert_floating_to_fixed(3.141593E-314, scale_type(-331)); + auto num9 = cudf::convert_floating_to_fixed(3.141593E-319, scale_type(-336)); + auto num10 = cudf::convert_floating_to_fixed(DBL_TRUE_MIN, scale_type(-341)); + + EXPECT_EQ(3.141593E-309, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(3.141593E-314, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(3.141593E-319, cudf::convert_fixed_to_floating(num9)); + EXPECT_EQ(DBL_TRUE_MIN, cudf::convert_fixed_to_floating(num10)); + } +} + TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath) { using decimalXX = fixed_point; @@ -442,8 +555,6 @@ void float_vector_test(ValueType const initial_value, int32_t const scale, Binop binop) { - using decimal32 = fixed_point; - std::vector vec1(size); std::vector vec2(size); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 860544b8606..8903f09b82b 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -352,11 +352,15 @@ TEST_F(ToArrowDeviceTest, EmptyTable) auto got_arrow_device = cudf::to_arrow_device(table->view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + ASSERT_CUDA_SUCCEEDED( + 
cudaEventSynchronize(*reinterpret_cast(got_arrow_device->sync_event))); compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); got_arrow_device = cudf::to_arrow_device(std::move(*table)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_device->sync_event))); compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); } @@ -386,6 +390,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -402,6 +408,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable) got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -456,6 +464,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -472,6 +482,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -538,6 +550,8 @@ TEST_F(ToArrowDeviceTest, NestedList) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); @@ -682,11 +696,15 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + 
ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } @@ -755,11 +773,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table) auto got_arrow_array = cudf::to_arrow_device(input.view()); ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } } @@ -802,11 +824,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9c76c344157..993ab82f423 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1400,9 +1400,7 @@ TEST_F(JsonReaderTest, JsonLongString) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); cudf::column_view int16_with_mask(repeat_times); cudf::column_view int16( diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index 946b939f456..2c4e29a01b9 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -51,16 +51,14 @@ TEST_F(JsonWriterTest, EmptyInput) .build(); // Empty columns in table - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"([])"; EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected_lines = "\n"; EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); @@ -68,8 +66,7 @@ TEST_F(JsonWriterTest, EmptyInput) cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json( - out_options, 
cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } @@ -94,22 +91,17 @@ TEST_F(JsonWriterTest, ErrorCases) .build(); // not enough column names - EXPECT_THROW( - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()), + cudf::logic_error); mt.schema_info.emplace_back("int16"); out_options.set_metadata(mt); - EXPECT_NO_THROW(cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource())); + EXPECT_NO_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream())); // chunk_rows must be at least 8 out_options.set_rows_per_chunk(0); - EXPECT_THROW( - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()), + cudf::logic_error); } TEST_F(JsonWriterTest, PlainTable) @@ -131,9 +123,7 @@ TEST_F(JsonWriterTest, PlainTable) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; @@ -163,9 +153,7 @@ TEST_F(JsonWriterTest, SimpleNested) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -197,9 +185,7 @@ TEST_F(JsonWriterTest, MixedNested) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" @@ -232,8 +218,7 @@ TEST_F(JsonWriterTest, WriteReadNested) .na_rep("null") .build(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -308,8 +293,7 @@ TEST_F(JsonWriterTest, WriteReadNested) mt.schema_info[2].children.clear(); out_options.set_metadata(mt); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) @@ -332,8 +316,7 @@ TEST_F(JsonWriterTest, WriteReadNested) // without 
column names out_options.set_metadata(cudf::io::table_metadata{}); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) .lines(true) @@ -371,8 +354,7 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} @@ -405,9 +387,7 @@ TEST_F(JsonWriterTest, NullList) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} @@ -446,9 +426,7 @@ TEST_F(JsonWriterTest, ChunkedNested) .na_rep("null") .rows_per_chunk(8); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -504,9 +482,7 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({} {"e":1} {"d":1} @@ -568,9 +544,7 @@ TEST_F(JsonWriterTest, Unicode) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b5e080f3cc5..39ba62952b4 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -54,9 +54,9 @@ using int32_col = column_wrapper; using int64_col = column_wrapper; using float32_col = column_wrapper; using float64_col = column_wrapper; -using dec32_col = column_wrapper; -using dec64_col = column_wrapper; -using dec128_col = column_wrapper; +using dec32_col = cudf::test::fixed_point_column_wrapper; +using dec64_col = cudf::test::fixed_point_column_wrapper; +using dec128_col = cudf::test::fixed_point_column_wrapper; using struct_col = cudf::test::structs_column_wrapper; template using list_col = cudf::test::lists_column_wrapper; @@ -355,12 +355,6 @@ TEST_F(OrcWriterTest, MultiColumn) auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{12}}; - }); - auto col7_data = 
cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}}; - }); bool_col col0(col0_data.begin(), col0_data.end()); int8_col col1(col1_data.begin(), col1_data.end()); @@ -368,8 +362,8 @@ TEST_F(OrcWriterTest, MultiColumn) int32_col col3(col3_data.begin(), col3_data.end()); float32_col col4(col4_data.begin(), col4_data.end()); float64_col col5(col5_data.begin(), col5_data.end()); - dec128_col col6(col6_data, col6_data + num_rows); - dec128_col col7(col7_data, col7_data + num_rows); + dec128_col col6{col6_vals.begin(), col6_vals.end(), numeric::scale_type{12}}; + dec128_col col7{col6_vals.begin(), col6_vals.end(), numeric::scale_type{-12}}; list_col col8{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; @@ -416,9 +410,6 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{col6_vals[i], numeric::scale_type{2}}; - }); auto col0_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); auto col1_mask = @@ -438,7 +429,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) int32_col col3{col3_data.begin(), col3_data.end(), col3_mask}; float32_col col4{col4_data.begin(), col4_data.end(), col4_mask}; float64_col col5{col5_data.begin(), col5_data.end(), col5_mask}; - dec64_col col6{col6_data, col6_data + num_rows, col6_mask}; + dec64_col col6{col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{2}}; list_col col7{ {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, col0_mask}; @@ -541,14 +532,11 @@ TEST_F(OrcWriterTest, SlicedTable) auto seq_col0 = random_values(num_rows); auto seq_col2 = random_values(num_rows); auto vals_col3 = random_values(num_rows); - auto seq_col3 = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals_col3[i], numeric::scale_type{2}}; - }); int32_col col0(seq_col0.begin(), seq_col0.end()); str_col col1(strings.begin(), strings.end()); float32_col col2(seq_col2.begin(), seq_col2.end()); - dec64_col col3(seq_col3, seq_col3 + num_rows); + dec64_col col3{vals_col3.begin(), vals_col3.end(), numeric::scale_type{2}}; list_col col4{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; @@ -1213,11 +1201,8 @@ TEST_P(OrcWriterTestDecimal, Decimal64) // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{scale}}; - }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); - dec64_col col{data, data + num_rows, mask}; + dec64_col col{vals.begin(), vals.end(), mask, numeric::scale_type{scale}}; cudf::table_view tbl({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); @@ -1244,11 +1229,8 @@ TEST_F(OrcWriterTest, Decimal32) // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal32{vals[i], numeric::scale_type{2}}; - }); auto mask = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); - dec32_col col{data, data + num_rows, mask}; + dec32_col col{vals.begin(), vals.end(), mask, numeric::scale_type{2}}; cudf::table_view expected({col}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); @@ -1527,12 +1509,9 @@ TEST_F(OrcReaderTest, DecimalOptions) { constexpr auto num_rows = 10; auto col_vals = random_values(num_rows); - auto col_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col_vals[i], numeric::scale_type{2}}; - }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; }); - dec128_col col{col_data, col_data + num_rows, mask}; + dec128_col col{col_vals.begin(), col_vals.end(), mask, numeric::scale_type{2}}; table_view expected({col}); cudf::io::table_input_metadata expected_metadata(expected); @@ -1555,15 +1534,9 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) { auto const num_rows = 100; - auto dec_vals = random_values(num_rows); - auto dec1_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{dec_vals[i], numeric::scale_type{2}}; - }); - auto dec2_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{dec_vals[i], numeric::scale_type{2}}; - }); - dec64_col dec1_col(dec1_data, dec1_data + num_rows); - dec128_col dec2_col(dec2_data, dec2_data + num_rows); + auto dec_vals = random_values(num_rows); + dec64_col dec1_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}}; + dec128_col dec2_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}}; auto child_struct_col = cudf::test::structs_column_wrapper{dec1_col, dec2_col}; auto int_vals = random_values(num_rows); @@ -1974,7 +1947,7 @@ TEST_F(OrcStatisticsTest, Empty) int32_col col0{}; float64_col col1{}; str_col col2{}; - dec64_col col3{}; + dec64_col col3{{}, numeric::scale_type{0}}; column_wrapper col4; bool_col col5{}; table_view expected({col0, col1, col2, col3, col4, col5}); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index f106fd5a487..9e66fc9409f 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -47,15 +47,6 @@ TEST_P(ParquetV2Test, MultiColumn) auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); auto col8_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; - }); - auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) { - return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; - }); // column_wrapper col0{col0_data.begin(), col0_data.end(), no_nulls()}; column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; @@ -63,9 +54,13 @@ TEST_P(ParquetV2Test, MultiColumn) column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; column_wrapper col5{col5_data.begin(), col5_data.end(), no_nulls()}; - column_wrapper col6{col6_data, col6_data + num_rows, no_nulls()}; - column_wrapper col7{col7_data, col7_data + num_rows, no_nulls()}; - column_wrapper col8{col8_data, col8_data + num_rows, no_nulls()}; + + 
cudf::test::fixed_point_column_wrapper col6( + col6_vals.begin(), col6_vals.end(), no_nulls(), numeric::scale_type{5}); + cudf::test::fixed_point_column_wrapper col7( + col7_vals.begin(), col7_vals.end(), no_nulls(), numeric::scale_type{-5}); + cudf::test::fixed_point_column_wrapper col8( + col8_vals.begin(), col8_vals.end(), no_nulls(), numeric::scale_type{-6}); auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; @@ -109,14 +104,6 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; - }); - // auto col0_mask = cudf::detail::make_counting_transform_iterator( - // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); auto col2_mask = no_nulls(); @@ -138,8 +125,11 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; - column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; + + cudf::test::fixed_point_column_wrapper col6( + col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper col7( + col7_vals.begin(), col7_vals.end(), col7_mask, numeric::scale_type{-8}); auto expected = table_view{{/*col0, */ col1, col2, col3, col4, col5, col6, col7}}; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index a1f4c7b81d8..e07ebe25322 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -35,7 +35,7 @@ using cudf::test::iterators::no_nulls; template -void test_durations(mask_op_t mask_op, bool use_byte_stream_split) +void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema) { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); @@ -76,20 +76,27 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) auto filepath = temp_env->get_temp_filepath("Durations.parquet"); cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_arrow_schema(arrow_schema); + cudf::io::write_parquet(out_opts); cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); auto durations_d_got = cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, 
durations_s_got->view()); + if (arrow_schema) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); @@ -98,10 +105,15 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) TEST_F(ParquetWriterTest, Durations) { - test_durations([](auto i) { return true; }, false); - test_durations([](auto i) { return (i % 2) != 0; }, false); - test_durations([](auto i) { return (i % 3) != 0; }, false); - test_durations([](auto i) { return false; }, false); + test_durations([](auto i) { return true; }, false, false); + test_durations([](auto i) { return (i % 2) != 0; }, false, false); + test_durations([](auto i) { return (i % 3) != 0; }, false, false); + test_durations([](auto i) { return false; }, false, false); + + test_durations([](auto i) { return true; }, false, true); + test_durations([](auto i) { return (i % 2) != 0; }, false, true); + test_durations([](auto i) { return (i % 3) != 0; }, false, true); + test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) @@ -493,6 +505,50 @@ TEST_F(ParquetWriterTest, DecimalWrite) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); } +TEST_F(ParquetWriterTest, DecimalWriteWithArrowSchema) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWriteWithArrowSchema.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .write_arrow_schema(true); + + cudf::io::table_input_metadata expected_metadata(table); + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL32_PRECISION); + expected_metadata.column_metadata[1].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL64_PRECISION); + args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(args); + + auto expected_col0 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto expected_col1 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto expected_table = table_view({expected_col0, expected_col1}); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected_table); +} + TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { auto const unused_table = std::make_unique
(); @@ -1935,10 +1991,15 @@ TEST_F(ParquetWriterTest, DecimalByteStreamSplit) TEST_F(ParquetWriterTest, DurationByteStreamSplit) { - test_durations([](auto i) { return true; }, true); - test_durations([](auto i) { return (i % 2) != 0; }, true); - test_durations([](auto i) { return (i % 3) != 0; }, true); - test_durations([](auto i) { return false; }, true); + test_durations([](auto i) { return true; }, true, false); + test_durations([](auto i) { return (i % 2) != 0; }, true, false); + test_durations([](auto i) { return (i % 3) != 0; }, true, false); + test_durations([](auto i) { return false; }, true, false); + + test_durations([](auto i) { return true; }, true, true); + test_durations([](auto i) { return (i % 2) != 0; }, true, true); + test_durations([](auto i) { return (i % 3) != 0; }, true, true); + test_durations([](auto i) { return false; }, true, true); } TEST_F(ParquetWriterTest, WriteFixedLenByteArray) diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 718ee83cf09..8fb2b403051 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -224,9 +224,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists) { // First Slice. - auto sliced_column_1 = - cudf::detail::slice(search_space, {1, 8}, cudf::get_default_stream()).front(); - auto search_key_one = create_scalar_search_key(1); + auto sliced_column_1 = cudf::slice(search_space, {1, 8}, cudf::get_default_stream()).front(); + auto search_key_one = create_scalar_search_key(1); { // CONTAINS auto result = cudf::lists::contains(sliced_column_1, *search_key_one); @@ -257,9 +256,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists) { // Second Slice. - auto sliced_column_2 = - cudf::detail::slice(search_space, {3, 10}, cudf::get_default_stream()).front(); - auto search_key_one = create_scalar_search_key(1); + auto sliced_column_2 = cudf::slice(search_space, {3, 10}, cudf::get_default_stream()).front(); + auto search_key_one = create_scalar_search_key(1); { // CONTAINS auto result = cudf::lists::contains(sliced_column_2, *search_key_one); diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index bc7488bbf9e..de155c35a5e 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -363,19 +363,16 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointInterleave) { using namespace numeric; using decimalXX = TypeParam; + using RepType = typename decimalXX::rep; for (int i = 0; i > -4; --i) { - auto const ONE = decimalXX{1, scale_type{i}}; - auto const TWO = decimalXX{2, scale_type{i}}; - auto const FOUR = decimalXX{4, scale_type{i}}; - auto const FIVE = decimalXX{5, scale_type{i}}; + auto const a = cudf::test::fixed_point_column_wrapper({1, 4}, scale_type{i}); + auto const b = cudf::test::fixed_point_column_wrapper({2, 5}, scale_type{i}); - auto const a = cudf::test::fixed_width_column_wrapper({ONE, FOUR}); - auto const b = cudf::test::fixed_width_column_wrapper({TWO, FIVE}); - - auto const input = cudf::table_view{std::vector{a, b}}; - auto const expected = cudf::test::fixed_width_column_wrapper({ONE, TWO, FOUR, FIVE}); - auto const actual = cudf::interleave_columns(input); + auto const input = cudf::table_view{std::vector{a, b}}; + auto const expected = + cudf::test::fixed_point_column_wrapper({1, 2, 4, 5}, scale_type{i}); + auto const actual = cudf::interleave_columns(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, actual->view()); } diff --git a/cpp/tests/streams/io/csv_test.cpp 
b/cpp/tests/streams/io/csv_test.cpp index 6e27db02d56..42894a0ebcb 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -39,12 +39,6 @@ TEST_F(CSVTest, CSVWriter) std::vector zeros(num_rows, 0); std::vector ones(num_rows, 1); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); @@ -52,8 +46,10 @@ TEST_F(CSVTest, CSVWriter) cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); - cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); std::vector col8_data(num_rows, "rapids"); cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); @@ -72,12 +68,6 @@ TEST_F(CSVTest, CSVReader) std::vector zeros(num_rows, 0); std::vector ones(num_rows, 1); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); @@ -85,8 +75,10 @@ TEST_F(CSVTest, CSVReader) cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); - cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); std::vector col8_data(num_rows, "rapids"); cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index 401c7049381..cc43bf15b5d 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -59,22 +59,10 @@ cudf::table construct_table() cudf::test::fixed_width_column_wrapper col3(zeros_iterator, zeros_iterator + num_rows); cudf::test::fixed_width_column_wrapper col4(zeros_iterator, zeros_iterator + num_rows); cudf::test::fixed_width_column_wrapper col5(zeros_iterator, zeros_iterator + num_rows); - - cudf::test::fixed_width_column_wrapper col6 = [&ones_iterator] { - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones_iterator[i], numeric::scale_type{12}}; - }); - return 
cudf::test::fixed_width_column_wrapper(col6_data, - col6_data + num_rows); - }(); - - cudf::test::fixed_width_column_wrapper col7 = [&ones_iterator] { - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}}; - }); - return cudf::test::fixed_width_column_wrapper(col7_data, - col7_data + num_rows); - }(); + cudf::test::fixed_point_column_wrapper col6( + ones_iterator, ones_iterator + num_rows, numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones_iterator, ones_iterator + num_rows, numeric::scale_type{-12}); cudf::test::lists_column_wrapper col8 = [] { auto col8_mask = diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp index b277d184e3a..9d2dec2d697 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -55,20 +55,10 @@ cudf::table construct_table() cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6 = [&ones] { - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - return cudf::test::fixed_width_column_wrapper(col6_data, - col6_data + num_rows); - }(); - cudf::test::fixed_width_column_wrapper col7 = [&ones] { - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); - return cudf::test::fixed_width_column_wrapper(col7_data, - col7_data + num_rows); - }(); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); cudf::test::lists_column_wrapper col8{ {1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}; diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 3aa7467d156..6c4afbb435a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) } } +TEST_F(StringsReplaceTest, EmptyTarget) +{ + auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"}); + auto const sv = cudf::strings_column_view(input); + + auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"}); + auto const tv = cudf::strings_column_view(targets); + + auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"}); + auto const rv = cudf::strings_column_view(repls); + + // empty target should be ignored + auto results = cudf::strings::replace_multiple(sv, tv, rv); + auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceTest, EmptyStringsColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index d53c64ed539..4c020cb4c29 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,6 +307,26 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar("s"); + auto empty = cudf::string_scalar(""); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::split_record(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + + result = cudf::strings::rsplit_record(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_record(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, MultiByteDelimiters) { // Overlapping delimiters diff --git a/dependencies.yaml b/dependencies.yaml index e3f8a72e76c..a19574b7658 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64] includes: - build_base @@ -243,7 +243,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - &ninja ninja build_all: common: @@ -323,6 +323,7 @@ dependencies: packages: # Hard pin the patch version used during the build. # Sync with conda build constraint & wheel run constraint. + # TODO: Change to `2.0.*` for NumPy 2 - numpy==1.23.* build_python_cudf: common: @@ -402,6 +403,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda @@ -547,6 +552,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 + # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0` - numpy>=1.23,<2.0a0 - pandas>=2.0,<2.2.3dev0 run_cudf: @@ -755,7 +761,7 @@ dependencies: - {matrix: null, packages: *cupy_packages_cu11} test_python_pandas_cudf: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: # dependencies to run pandas tests # https://github.com/pandas-dev/pandas/blob/main/environment.yml @@ -766,7 +772,7 @@ dependencies: - pytest-reportlog test_python_cudf_pandas: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: - ipython - openpyxl diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md index c8da689479c..4f5a57fec02 100644 --- a/docs/cudf/source/developer_guide/documentation.md +++ b/docs/cudf/source/developer_guide/documentation.md @@ -164,7 +164,7 @@ The directive should be used inside docstrings like so: Docstring body .. pandas-compat:: - **$API_NAME** + :meth:`pandas.DataFrame.METHOD` Explanation of differences ``` diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst index ebf5fab3052..558268ea495 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -1,6 +1,6 @@ -======= -copying -======= +======== +datetime +======== .. 
automodule:: cudf._lib.pylibcudf.datetime :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst new file mode 100644 index 00000000000..03f769ee861 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst @@ -0,0 +1,6 @@ +=========== +expressions +=========== + +.. automodule:: cudf._lib.pylibcudf.expressions + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e9dad705cbf..505765bba0f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -15,25 +15,27 @@ This page provides API documentation for pylibcudf. concatenate copying datetime + expressions filling gpumemoryview groupby - io/index.rst interop join lists merge quantiles reduce + replace reshape rolling round scalar search - stream_compaction sorting - replace + stream_compaction table + traits + transform types unary @@ -41,4 +43,5 @@ This page provides API documentation for pylibcudf. :maxdepth: 2 :caption: Subpackages + io/index.rst strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst new file mode 100644 index 00000000000..5a2276f8b2d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst @@ -0,0 +1,6 @@ +=== +CSV +=== + +.. automodule:: cudf._lib.pylibcudf.io.csv + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index bde6d8094ce..697bce739de 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -16,4 +16,5 @@ I/O Functions :maxdepth: 1 avro + csv json diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst new file mode 100644 index 00000000000..294ca8dc78c --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst @@ -0,0 +1,6 @@ +====== +traits +====== + +.. automodule:: cudf._lib.pylibcudf.traits + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst new file mode 100644 index 00000000000..ef04bbad7e6 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst @@ -0,0 +1,6 @@ +========= +transform +========= + +.. automodule:: cudf._lib.pylibcudf.transform + :members: diff --git a/java/README.md b/java/README.md index 2d8e2190fee..0d9e060b7cd 100644 --- a/java/README.md +++ b/java/README.md @@ -51,9 +51,13 @@ CUDA 11.0: ## Build From Source Build [libcudf](../cpp) first, and make sure the JDK is installed and available. Specify -the cmake option `-DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF` when building so -that Apache Arrow is linked statically to libcudf, as this will help create a jar that -does not require Arrow and its dependencies to be available in the runtime environment. +the following cmake options to the libcudf build: +``` +-DCUDF_LARGE_STRINGS_DISABLED=ON -DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF +``` +These options: +- Disable large string support, see https://github.com/rapidsai/cudf/issues/16215 +- Statically link Arrow to libcudf to remove Arrow as a runtime dependency. 
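For reference, the same three flags are wired into the Java CI build script (`java/ci/build-in-docker.sh`) later in this patch. As a rough sketch only, a local libcudf configure step that picks them up might look like the following, where the generator, build directory, and install prefix are illustrative assumptions rather than values taken from this repository:
```
# Sketch only: the generator, build dir, and install prefix are assumptions;
# the three -DCUDF_* flags are the ones listed above.
cd cpp && mkdir -p build && cd build
cmake .. -GNinja \
      -DCMAKE_INSTALL_PREFIX=$HOME/cudf-install \
      -DCUDF_LARGE_STRINGS_DISABLED=ON \
      -DCUDF_USE_ARROW_STATIC=ON \
      -DCUDF_ENABLE_ARROW_S3=OFF
cmake --build . --target install
```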
After building libcudf, the Java bindings can be built via Maven, e.g.: ``` diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 72b1742f7cb..5a429bdc739 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \ -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ -DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME \ -DUSE_NVTX=$ENABLE_NVTX \ + -DCUDF_LARGE_STRINGS_DISABLED=ON \ -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1d6a3b3304a..7136b162c13 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3509,9 +3509,9 @@ void testCastFloatToDecimal() { @Test void testCastDoubleToDecimal() { testCastNumericToDecimalsAndBack(DType.FLOAT64, false, 0, - () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Long.MAX_VALUE), - () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Long.MAX_VALUE), - new Long[]{1L, 2L, -3L, null, 2L, Long.MAX_VALUE} + () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Integer.MAX_VALUE), + () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Integer.MAX_VALUE), + new Long[]{1L, 2L, -3L, null, 2L, (long) Integer.MAX_VALUE} ); testCastNumericToDecimalsAndBack(DType.FLOAT64, false, -2, () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, -55.01999), diff --git a/pyproject.toml b/pyproject.toml index 2f59864894b..e15cb7b3cdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,69 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] +typing-modules = ["cudf._typing"] +select = [ + # pycodestyle Error + "E", + # Pyflakes + "F", + # pycodestyle Warning + "W", + # no-blank-line-before-function + "D201", + # one-blank-line-after-class + "D204", + # indent-with-spaces + "D206", + # under-indentation + "D207", + # over-indentation + "D208", + # new-line-after-last-paragraph + "D209", + # surrounding-whitespace + "D210", + # blank-line-before-class + "D211", + # section-not-over-indented + "D214", + # section-underline-not-over-indented + "D215", + # triple-single-quotes + "D300", + # escape-sequence-in-docstring + "D301", + # first-line-capitalized + "D403", + # capitalize-section-name + "D405", + # new-line-after-section-name + "D406", + # dashed-underline-after-section + "D407", + # section-underline-after-name + "D408", + # section-underline-matches-section-length + "D409", + # no-blank-line-after-section + "D410", + # no-blank-line-before-section + "D411", + # blank-lines-between-header-and-content + "D412", + # empty-docstring-section + "D414", + # overload-with-docstring + "D418", + # flake8-type-checking + "TCH", + # flake8-future-annotations + "FA", + # non-pep585-annotation + "UP006", + # 
non-pep604-annotation + "UP007" +] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index e6dfe2eae62..8ce92e1c0f6 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -192,8 +192,7 @@ def convert_nulls_to_none(records, df): col for col in df.columns if df[col].dtype in pandas_dtypes_to_np_dtypes - or pd.api.types.is_datetime64_dtype(df[col].dtype) - or pd.api.types.is_timedelta64_dtype(df[col].dtype) + or df[col].dtype.kind in "mM" ] for record in records: diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5a067e84f56..38b7e9ebe04 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources copying.pyx csv.pyx datetime.pyx - expressions.pyx filling.pyx groupby.pyx hash.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 18b95f5f2e1..34c0e29d0b1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np from . import ( @@ -8,7 +8,6 @@ copying, csv, datetime, - expressions, filling, groupby, hash, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7155017b7af..e030147fdd3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -202,11 +202,13 @@ cdef class Column: def _clear_cache(self): self._distinct_count = {} - try: - del self.memory_usage - except AttributeError: - # `self.memory_usage` was never called before, So ignore. - pass + attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing") + for attr in attrs: + try: + delattr(self, attr) + except AttributeError: + # attr was not called yet, so ignore. + pass self._null_count = None def set_mask(self, value): diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index c706351a683..099b61d62ae 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp cimport bool -from libcpp.map cimport map from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -9,8 +8,12 @@ from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource -from cudf._lib.pylibcudf.libcudf.types cimport data_type -from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import errno +import os +from collections import abc +from io import BytesIO, StringIO import numpy as np import pandas as pd @@ -18,65 +21,24 @@ import pandas as pd import cudf from cudf.core.buffer import acquire_spill_lock -from cudf._lib.pylibcudf.libcudf.types cimport size_type - -import errno -import os -from collections import abc -from enum import IntEnum -from io import BytesIO, StringIO - -from libc.stdint cimport int32_t from libcpp cimport bool -from cudf._lib.io.utils cimport make_sink_info, make_source_info +from cudf._lib.io.utils cimport make_sink_info from cudf._lib.pylibcudf.libcudf.io.csv cimport ( - csv_reader_options, csv_writer_options, - read_csv as cpp_read_csv, write_csv as cpp_write_csv, ) from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - quote_style, - sink_info, - source_info, - table_with_metadata, -) +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, sink_info from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table from pyarrow.lib import NativeFile +import cudf._lib.pylibcudf as plc from cudf.api.types import is_hashable -ctypedef int32_t underlying_type_t_compression - - -class Compression(IntEnum): - INFER = ( - compression_type.AUTO - ) - SNAPPY = ( - compression_type.SNAPPY - ) - GZIP = ( - compression_type.GZIP - ) - BZ2 = ( - compression_type.BZIP2 - ) - BROTLI = ( - compression_type.BROTLI - ) - ZIP = ( - compression_type.ZIP - ) - XZ = ( - compression_type.XZ - ) - +from cudf._lib.pylibcudf.types cimport DataType CSV_HEX_TYPE_MAP = { "hex": np.dtype("int64"), @@ -84,234 +46,6 @@ CSV_HEX_TYPE_MAP = { "hex32": np.dtype("int32") } -cdef csv_reader_options make_csv_reader_options( - object datasource, - object lineterminator, - object quotechar, - int quoting, - bool doublequote, - object header, - bool mangle_dupe_cols, - object usecols, - object delimiter, - bool delim_whitespace, - bool skipinitialspace, - object names, - object dtype, - int skipfooter, - int skiprows, - bool dayfirst, - object compression, - object thousands, - object decimal, - object true_values, - object false_values, - object nrows, - object byte_range, - bool skip_blank_lines, - object parse_dates, - object comment, - object na_values, - bool keep_default_na, - bool na_filter, - object prefix, - object index_col, -) except *: - cdef source_info c_source_info = make_source_info([datasource]) - cdef compression_type c_compression - cdef vector[string] c_names - cdef size_t c_byte_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_t c_byte_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef vector[int] c_use_cols_indexes - cdef vector[string] c_use_cols_names - cdef size_type c_nrows = nrows if nrows is not None else -1 - cdef quote_style c_quoting - cdef 
vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[string] c_hex_col_names - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - cdef vector[int] c_hex_col_indexes - cdef vector[string] c_true_values - cdef vector[string] c_false_values - cdef vector[string] c_na_values - - # Reader settings - if compression is None: - c_compression = compression_type.NONE - else: - compression = str(compression) - compression = Compression[compression.upper()] - c_compression = ( - compression - ) - - if quoting == 1: - c_quoting = quote_style.ALL - elif quoting == 2: - c_quoting = quote_style.NONNUMERIC - elif quoting == 3: - c_quoting = quote_style.NONE - else: - # Default value - c_quoting = quote_style.MINIMAL - - cdef csv_reader_options csv_reader_options_c = move( - csv_reader_options.builder(c_source_info) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(c_byte_range_offset) - .byte_range_size(c_byte_range_size) - .nrows(c_nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(c_quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - csv_reader_options_c.set_header(-1) - else: - csv_reader_options_c.set_header(header) - - c_names.reserve(len(names)) - for name in names: - c_names.push_back(str(name).encode()) - csv_reader_options_c.set_names(c_names) - else: - if header is None: - csv_reader_options_c.set_header(-1) - elif header == 'infer': - csv_reader_options_c.set_header(0) - else: - csv_reader_options_c.set_header(header) - - if prefix is not None: - csv_reader_options_c.set_prefix(prefix.encode()) - - if usecols is not None: - all_int = all(isinstance(col, int) for col in usecols) - if all_int: - c_use_cols_indexes.reserve(len(usecols)) - c_use_cols_indexes = usecols - csv_reader_options_c.set_use_cols_indexes(c_use_cols_indexes) - else: - c_use_cols_names.reserve(len(usecols)) - for col_name in usecols: - c_use_cols_names.push_back( - str(col_name).encode() - ) - csv_reader_options_c.set_use_cols_names(c_use_cols_names) - - if delimiter is not None: - csv_reader_options_c.set_delimiter(ord(delimiter)) - - if thousands is not None: - csv_reader_options_c.set_thousands(ord(thousands)) - - if comment is not None: - csv_reader_options_c.set_comment(ord(comment)) - - if parse_dates is not None: - if isinstance(parse_dates, abc.Mapping): - raise NotImplementedError( - "`parse_dates`: dictionaries are unsupported") - if not isinstance(parse_dates, abc.Iterable): - raise NotImplementedError( - "`parse_dates`: an iterable is required") - for col in parse_dates: - if isinstance(col, str): - c_parse_dates_names.push_back(str(col).encode()) - elif isinstance(col, int): - c_parse_dates_indexes.push_back(col) - else: - raise NotImplementedError( - "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_parse_dates(c_parse_dates_names) - csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = 
CSV_HEX_TYPE_MAP[v] - c_hex_col_names.push_back(str(k).encode()) - - c_dtypes_map[str(k).encode()] = \ - _get_cudf_data_type_from_dtype( - cudf.dtype(col_type)) - csv_reader_options_c.set_dtypes(c_dtypes_map) - csv_reader_options_c.set_parse_hex(c_hex_col_names) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - c_dtypes_list.reserve(1) - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - c_hex_col_indexes.push_back(0) - - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype(dtype) - ) - csv_reader_options_c.set_dtypes(c_dtypes_list) - csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - elif isinstance(dtype, abc.Collection): - c_dtypes_list.reserve(len(dtype)) - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - c_hex_col_indexes.push_back(index) - - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype(col_dtype) - ) - csv_reader_options_c.set_dtypes(c_dtypes_list) - csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - - if true_values is not None: - c_true_values.reserve(len(true_values)) - for tv in true_values: - c_true_values.push_back(tv.encode()) - csv_reader_options_c.set_true_values(c_true_values) - - if false_values is not None: - c_false_values.reserve(len(false_values)) - for fv in false_values: - c_false_values.push_back(fv.encode()) - csv_reader_options_c.set_false_values(c_false_values) - - if na_values is not None: - c_na_values.reserve(len(na_values)) - for nv in na_values: - c_na_values.push_back(nv.encode()) - csv_reader_options_c.set_na_values(c_na_values) - - return csv_reader_options_c - def validate_args( object delimiter, @@ -381,7 +115,6 @@ def read_csv( bool na_filter=True, object prefix=None, object index_col=None, - **kwargs, ): """ Cython function to call into libcudf API, see `read_csv`. 
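The hunk that follows replaces the hand-built `csv_reader_options` plumbing above with a single call into pylibcudf. As a minimal sketch of that new entry point, usable outside of cudf's own `read_csv`, something like the block below should suffice; the file name and option values are illustrative assumptions, only keyword arguments that appear in the call added by this patch are used, and the remaining options are assumed to have usable defaults in pylibcudf:
```
# Minimal sketch of the pylibcudf CSV reader used by the new read_csv path.
# "example.csv" and the option values are hypothetical; the keyword names
# match the call introduced in the hunk that follows.
import cudf._lib.pylibcudf as plc

source = plc.io.SourceInfo(["example.csv"])  # list of paths or buffers
tbl_w_meta = plc.io.csv.read_csv(
    source,
    delimiter=",",
    header=0,          # read column names from the first row
    na_filter=True,    # detect NA values while parsing
)
# The returned object carries both the parsed columns and their names, which
# cudf then turns into a DataFrame via data_from_pylibcudf_io (see below).
```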
@@ -413,23 +146,120 @@ def read_csv( if delimiter is None: delimiter = sep - cdef csv_reader_options read_csv_options_c = make_csv_reader_options( - datasource, lineterminator, quotechar, quoting, doublequote, - header, mangle_dupe_cols, usecols, delimiter, delim_whitespace, - skipinitialspace, names, dtype, skipfooter, skiprows, dayfirst, - compression, thousands, decimal, true_values, false_values, nrows, - byte_range, skip_blank_lines, parse_dates, comment, na_values, - keep_default_na, na_filter, prefix, index_col) + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = compression_type.NONE + else: + compression_map = { + "infer": compression_type.AUTO, + "gzip": compression_type.GZIP, + "bz2": compression_type.BZIP2, + "zip": compression_type.ZIP, + } + c_compression = compression_map[compression] - cdef table_with_metadata c_result - with nogil: - c_result = move(cpp_read_csv(read_csv_options_c)) + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == 'infer': + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == 'infer': + header = 0 - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) + hex_cols = [] + + new_dtypes = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = dict() + for k, v in dtype.items(): + col_type = v + if is_hashable(v) and v in CSV_HEX_TYPE_MAP: + col_type = CSV_HEX_TYPE_MAP[v] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif ( + cudf.api.types.is_scalar(dtype) or + isinstance(dtype, ( + np.dtype, pd.api.extensions.ExtensionDtype, type + )) + ): + if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: + dtype = CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + new_dtypes.append( + _get_plc_data_type_from_dtype(dtype) + ) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: + col_dtype = CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append( + _get_plc_data_type_from_dtype(col_dtype) + ) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + + lineterminator = str(lineterminator) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + plc.io.csv.read_csv( + plc.io.SourceInfo([datasource]), + lineterminator=lineterminator, + quotechar = quotechar, + quoting = quoting, + doublequote = doublequote, + header = header, + mangle_dupe_cols = mangle_dupe_cols, + usecols = usecols, + delimiter = delimiter, + delim_whitespace = delim_whitespace, + skipinitialspace = skipinitialspace, + col_names = names, + dtypes = new_dtypes, + skipfooter = skipfooter, + skiprows = skiprows, + dayfirst = dayfirst, + compression = c_compression, + thousands = thousands, + decimal = decimal, + true_values = true_values, + false_values = false_values, + nrows = nrows if nrows is not None else -1, + byte_range_offset = byte_range[0], + byte_range_size = byte_range[1], + skip_blank_lines = skip_blank_lines, + parse_dates = parse_dates, + parse_hex = hex_cols, + comment = comment, + na_values = na_values, + keep_default_na = keep_default_na, + na_filter = na_filter, 
+ prefix = prefix, + ) + ) + ) if dtype is not None: if isinstance(dtype, abc.Mapping): @@ -450,7 +280,7 @@ def read_csv( col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) - if names is not None and isinstance(names[0], (int)): + if names is not None and len(names) and isinstance(names[0], (int)): df.columns = [int(x) for x in df._data] # Set index if the index_col parameter is passed @@ -459,7 +289,7 @@ def read_csv( index_col_name = df._data.select_by_index(index_col).names[0] df = df.set_index(index_col_name) if isinstance(index_col_name, str) and \ - names is None and header in ("infer",): + names is None and orig_header == "infer": if index_col_name.startswith("Unnamed:"): # TODO: Try to upstream it to libcudf # csv reader in future @@ -550,7 +380,7 @@ def write_csv( ) -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: +cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 @@ -561,36 +391,36 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: if isinstance(dtype, str): if str(dtype) == "date32": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_DAYS ) elif str(dtype) in ("date", "date64"): - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp[us]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MICROSECONDS ) elif str(dtype) == "timestamp[s]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_SECONDS ) elif str(dtype) == "timestamp[ms]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp[ns]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_NANOSECONDS ) dtype = cudf.dtype(dtype) - return dtype_to_data_type(dtype) + return dtype_to_pylibcudf_type(dtype) def columns_apply_na_rep(column_names, na_rep): diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx deleted file mode 100644 index 3fb29279ed7..00000000000 --- a/python/cudf/cudf/_lib/expressions.pyx +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from enum import Enum - -import numpy as np - -from cython.operator cimport dereference -from libc.stdint cimport int64_t -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_us, -) - -# Necessary for proper casting, see below. 
-ctypedef int32_t underlying_type_ast_operator - - -# Aliases for simplicity -ctypedef unique_ptr[libcudf_exp.expression] expression_ptr - - -class ASTOperator(Enum): - ADD = libcudf_exp.ast_operator.ADD - SUB = libcudf_exp.ast_operator.SUB - MUL = libcudf_exp.ast_operator.MUL - DIV = libcudf_exp.ast_operator.DIV - TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV - FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV - MOD = libcudf_exp.ast_operator.MOD - PYMOD = libcudf_exp.ast_operator.PYMOD - POW = libcudf_exp.ast_operator.POW - EQUAL = libcudf_exp.ast_operator.EQUAL - NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL - NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL - LESS = libcudf_exp.ast_operator.LESS - GREATER = libcudf_exp.ast_operator.GREATER - LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL - GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL - BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND - BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR - BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR - LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND - LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR - # Unary operators - IDENTITY = libcudf_exp.ast_operator.IDENTITY - IS_NULL = libcudf_exp.ast_operator.IS_NULL - SIN = libcudf_exp.ast_operator.SIN - COS = libcudf_exp.ast_operator.COS - TAN = libcudf_exp.ast_operator.TAN - ARCSIN = libcudf_exp.ast_operator.ARCSIN - ARCCOS = libcudf_exp.ast_operator.ARCCOS - ARCTAN = libcudf_exp.ast_operator.ARCTAN - SINH = libcudf_exp.ast_operator.SINH - COSH = libcudf_exp.ast_operator.COSH - TANH = libcudf_exp.ast_operator.TANH - ARCSINH = libcudf_exp.ast_operator.ARCSINH - ARCCOSH = libcudf_exp.ast_operator.ARCCOSH - ARCTANH = libcudf_exp.ast_operator.ARCTANH - EXP = libcudf_exp.ast_operator.EXP - LOG = libcudf_exp.ast_operator.LOG - SQRT = libcudf_exp.ast_operator.SQRT - CBRT = libcudf_exp.ast_operator.CBRT - CEIL = libcudf_exp.ast_operator.CEIL - FLOOR = libcudf_exp.ast_operator.FLOOR - ABS = libcudf_exp.ast_operator.ABS - RINT = libcudf_exp.ast_operator.RINT - BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT - NOT = libcudf_exp.ast_operator.NOT - - -class TableReference(Enum): - LEFT = libcudf_exp.table_reference.LEFT - RIGHT = libcudf_exp.table_reference.RIGHT - - -# Note that this function only currently supports numeric literals. libcudf -# expressions don't really support other types yet though, so this isn't -# restrictive at the moment. 
-cdef class Literal(Expression): - def __cinit__(self, value): - if isinstance(value, int): - self.c_scalar.reset(new numeric_scalar[int64_t](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, float): - self.c_scalar.reset(new numeric_scalar[double](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, str): - self.c_scalar.reset(new string_scalar(value.encode(), True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, np.datetime64): - scale, _ = np.datetime_data(value.dtype) - int_value = value.astype(np.int64) - if scale == "ms": - self.c_scalar.reset(new timestamp_scalar[timestamp_ms]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif scale == "us": - self.c_scalar.reset(new timestamp_scalar[timestamp_us]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - else: - raise NotImplementedError( - f"Unhandled datetime scale {scale=}" - ) - else: - raise NotImplementedError( - f"Don't know how to make literal with type {type(value)}" - ) - - -cdef class ColumnReference(Expression): - def __cinit__(self, size_type index): - self.c_obj = move(make_unique[libcudf_exp.column_reference]( - index - )) - - -cdef class Operation(Expression): - def __cinit__(self, op, Expression left, Expression right=None): - cdef libcudf_exp.ast_operator op_value = ( - op.value - ) - - if right is None: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj) - )) - else: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj), dereference(right.c_obj) - )) - -cdef class ColumnNameReference(Expression): - def __cinit__(self, string name): - self.c_obj = \ - move(make_unique[libcudf_exp.column_name_reference](name)) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 252d986843a..680a87c789e 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -16,6 +16,10 @@ cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* +cdef add_df_col_struct_names( + df, + child_names_dict +) cdef update_struct_field_names( table, vector[column_name_info]& schema_info) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 1d7c56888d9..58956b9e9b7 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -147,10 +147,37 @@ cdef cppclass iobase_data_sink(data_sink): return buf.tell() +cdef add_df_col_struct_names(df, child_names_dict): + for name, child_names in child_names_dict.items(): + col = df._data[name] + + df._data[name] = update_col_struct_field_names(col, child_names) + + +cdef update_col_struct_field_names(Column col, child_names): + if col.children: + children = list(col.children) + for i, (child, names) in enumerate(zip(children, child_names.values())): + children[i] = update_col_struct_field_names( + child, + names + ) + col.set_base_children(tuple(children)) + + if isinstance(col.dtype, StructDtype): + col = col._rename_fields( + child_names.keys() + ) + + return col + + cdef update_struct_field_names( table, 
vector[column_name_info]& schema_info ): + # Deprecated, remove in favor of add_col_struct_names + # when a reader is ported to pylibcudf for i, (name, col) in enumerate(table._data.items()): table._data[name] = update_column_struct_field_names( col, schema_info[i] diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 22e34feb547..03bf9ed8b75 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -8,26 +8,17 @@ import cudf from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.io.utils cimport make_source_info, update_struct_field_names -from cudf._lib.pylibcudf.libcudf.io.json cimport ( - json_reader_options, - json_recovery_mode_t, - read_json as libcudf_read_json, - schema_element, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - table_with_metadata, -) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.column cimport Column +from cudf._lib.io.utils cimport add_df_col_struct_names +from cudf._lib.pylibcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.types cimport DataType from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io import cudf._lib.pylibcudf as plc @@ -62,6 +53,7 @@ cpdef read_json(object filepaths_or_buffers, # If input data is a JSON string (or StringIO), hold a reference to # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` + for idx in range(len(filepaths_or_buffers)): if isinstance(filepaths_or_buffers[idx], io.StringIO): filepaths_or_buffers[idx] = \ @@ -71,17 +63,7 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef vector[data_type] c_dtypes_list - cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression - # Determine byte read offsets if applicable - cdef size_type c_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_type c_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef bool c_lines = lines if compression is not None: if compression == 'gzip': @@ -94,57 +76,71 @@ cpdef read_json(object filepaths_or_buffers, c_compression = cudf_io_types.compression_type.AUTO else: c_compression = cudf_io_types.compression_type.NONE - is_list_like_dtypes = False + + processed_dtypes = None + if dtype is False: raise ValueError("False value is unsupported for `dtype`") elif dtype is not True: + processed_dtypes = [] if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - c_dtypes_schema_map[str(k).encode()] = \ - _get_cudf_schema_element_from_dtype(v) + # Make sure keys are string + k = str(k) + lib_type, child_types = _get_cudf_schema_element_from_dtype(v) + processed_dtypes.append((k, lib_type, child_types)) elif isinstance(dtype, abc.Collection): - is_list_like_dtypes = True - c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: - 
c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype( - col_dtype)) + processed_dtypes.append( + # Ignore child columns since we cannot specify their dtypes + # when passing a list + _get_cudf_schema_element_from_dtype(col_dtype)[0] + ) else: raise TypeError("`dtype` must be 'list like' or 'dict'") - cdef json_reader_options opts = move( - json_reader_options.builder(make_source_info(filepaths_or_buffers)) - .compression(c_compression) - .lines(c_lines) - .byte_range_offset(c_range_offset) - .byte_range_size(c_range_size) - .recovery_mode(_get_json_recovery_mode(on_bad_lines)) - .build() - ) - if is_list_like_dtypes: - opts.set_dtypes(c_dtypes_list) + if cudf.get_option("io.json.low_memory") and lines: + res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) + ) + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in res_cols], + column_names=res_col_names, + index_names=None + ) + ) + add_df_col_struct_names(df, res_child_names) + return df else: - opts.set_dtypes(c_dtypes_schema_map) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - - # Read JSON - cdef cudf_io_types.table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_json(opts)) - - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) + table_w_meta = plc.io.json.read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + lines, + byte_range_offset = byte_range[0] if byte_range is not None else 0, + byte_range_size = byte_range[1] if byte_range is not None else 0, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) + ) - update_struct_field_names(df, c_result.metadata.schema_info) + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + table_w_meta + ) + ) - return df + # Post-processing to add in struct column names + add_df_col_struct_names(df, table_w_meta.child_names) + return df @acquire_spill_lock() @@ -192,28 +188,32 @@ def write_json( ) -cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: - cdef schema_element s_element - cdef data_type lib_type +cdef _get_cudf_schema_element_from_dtype(object dtype) except *: dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - lib_type = dtype_to_data_type(dtype) - s_element.type = lib_type + + lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) + child_types = [] + if isinstance(dtype, cudf.StructDtype): for name, child_type in dtype.fields.items(): - s_element.child_types[name.encode()] = \ + child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(child_type) + child_types.append((name, child_lib_type, grandchild_types)) elif isinstance(dtype, cudf.ListDtype): - s_element.child_types["offsets".encode()] = \ - _get_cudf_schema_element_from_dtype(cudf.dtype("int32")) - s_element.child_types["element".encode()] = \ 
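Editorial note: with the hunk above, the classic cudf reader dispatches to `plc.io.json.chunked_read_json` when the `io.json.low_memory` option is set and `lines=True`. A minimal sketch from the public API (the option name comes from the code above; the input file name is hypothetical):

```python
import cudf

cudf.set_option("io.json.low_memory", True)  # routes read_json through chunked_read_json
df = cudf.read_json("records.jsonl", lines=True)  # hypothetical JSON-lines file
```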
+ child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(dtype.element_type) - return s_element + child_types = [ + ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), + ("element", child_lib_type, grandchild_types) + ] + + return lib_type, child_types cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0ad09dba717..76f37c3b845 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -8,11 +8,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( - count_elements as cpp_count_elements, -) -from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, ) @@ -38,19 +33,10 @@ from cudf._lib.pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): - - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.count_elements( + col.to_pylibcudf(mode="read")) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_count_elements(list_view.get()[0])) - - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() @@ -116,37 +102,23 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.extract_list_element( + col.to_pylibcudf(mode="read"), + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(extract_list_element(list_view.get()[0], index)) - - result = Column.from_unique_ptr(move(c_result)) - return result - @acquire_spill_lock() def extract_element_column(Column col, Column index): - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.extract_list_element( + col.to_pylibcudf(mode="read"), + index.to_pylibcudf(mode="read"), + ) ) - cdef column_view index_view = index.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(extract_list_element(list_view.get()[0], index_view)) - - result = Column.from_unique_ptr(move(c_result)) - return result - @acquire_spill_lock() def contains_scalar(Column col, py_search_key): diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d1ec5be9e62..e7959d21e01 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.io.utils cimport ( make_sinks_info, make_source_info, update_struct_field_names, ) +from cudf._lib.pylibcudf.expressions cimport Expression from 
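Editorial note: the lists routines above now round-trip through pylibcudf. A minimal sketch of the underlying calls, assuming `plc.interop.from_arrow` accepts a pyarrow list array:

```python
import pyarrow as pa
import cudf._lib.pylibcudf as plc

col = plc.interop.from_arrow(pa.array([[1, 2, 3], [4], []]))  # assumed interop path
lengths = plc.lists.count_elements(col)          # 3, 1, 0
firsts = plc.lists.extract_list_element(col, 0)  # 1, 4, <null> (out of bounds -> null)
```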
cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( @@ -440,6 +440,7 @@ def write_parquet( object column_encoding=None, object column_type_length=None, object output_as_binary=None, + write_arrow_schema=False, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -544,6 +545,7 @@ def write_parquet( .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) .build() ) if partitions_info is not None: @@ -623,6 +625,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. See Also -------- cudf.io.parquet.write_parquet @@ -641,6 +646,7 @@ cdef class ParquetWriter: cdef size_type max_page_size_rows cdef size_t max_dictionary_size cdef cudf_io_types.dictionary_policy dict_policy + cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", @@ -649,7 +655,8 @@ cdef class ParquetWriter: int max_page_size_bytes=524288, int max_page_size_rows=20000, int max_dictionary_size=1048576, - bool use_dictionary=True): + bool use_dictionary=True, + bool store_schema=False): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -670,6 +677,7 @@ cdef class ParquetWriter: if use_dictionary else cudf_io_types.dictionary_policy.NEVER ) + self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -788,6 +796,7 @@ cdef class ParquetWriter: .max_page_size_bytes(self.max_page_size_bytes) .max_page_size_rows(self.max_page_size_rows) .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) .build() ) args.set_dictionary_policy(self.dict_policy) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0a198f431a7..0800fa18e94 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + expressions.pyx filling.pyx gpumemoryview.pyx groupby.pyx @@ -38,6 +39,8 @@ set(cython_sources stream_compaction.pyx sorting.pyx table.pyx + traits.pyx + transform.pyx types.pyx unary.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5131df9a5cd..26e89b818d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( concatenate, copying, datetime, + expressions, filling, groupby, join, @@ -23,6 +24,8 @@ from . 
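Editorial note: a minimal sketch of the new `store_schema` flag on the Cython `ParquetWriter` patched above (output path and data are hypothetical; `close()` is assumed to behave as in the existing writer API):

```python
import cudf
from cudf._lib.parquet import ParquetWriter

df = cudf.DataFrame({"a": [1, 2, 3]})
writer = ParquetWriter("out.parquet", store_schema=True)  # embed the arrow schema in the footer
writer.write_table(df)
writer.close()
```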
cimport ( sorting, stream_compaction, strings, + traits, + transform, types, unary, ) @@ -54,12 +57,15 @@ __all__ = [ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 43a9e2aca31..e89a5ed9f96 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + expressions, filling, groupby, interop, @@ -23,6 +24,8 @@ sorting, stream_compaction, strings, + traits, + transform, types, unary, ) @@ -35,6 +38,7 @@ __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "TypeId", @@ -54,12 +58,15 @@ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd index 9a8c8e49dcf..2411e28ac66 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator from .column cimport Column @@ -22,3 +24,10 @@ cpdef Column binary_operation( binary_operator op, DataType output_type ) + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +) diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx index c1d669c3c1c..44d9f4ad04a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx @@ -2,6 +2,7 @@ from cython.operator import dereference +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -84,3 +85,37 @@ cpdef Column binary_operation( raise ValueError(f"Invalid arguments {lhs} and {rhs}") return Column.from_libcudf(move(result)) + + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +): + """Check if an operation is supported for the given data types. + + For details, see :cpp:func::is_supported_operation`. + + Parameters + ---------- + out : DataType + The output data type. + lhs : DataType + The left hand side data type. + rhs : DataType + The right hand side data type. + op : BinaryOperator + The operation to check. 
+ Returns + ------- + bool + True if the operation is supported, False otherwise + """ + + return cpp_binaryop.is_supported_operation( + out.c_obj, + lhs.c_obj, + rhs.c_obj, + op + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index d13791d95cf..13ee0a70681 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -50,6 +50,7 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) cpdef Column copy(self) + cpdef Column with_mask(self, gpumemoryview, size_type) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e0cf8b7ee32..cb96c1d9fce 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -175,6 +175,32 @@ cdef class Column: children, ) + cpdef Column with_mask(self, gpumemoryview mask, size_type null_count): + """Augment this column with a new null mask. + + Parameters + ---------- + mask : gpumemoryview + New mask (or None to unset the mask) + null_count : int + New null count. If this is incorrect, bad things happen. + + Returns + ------- + New Column object sharing data with self (except for the mask which is new). + """ + if mask is None and null_count > 0: + raise ValueError("Empty mask must have null count of zero") + return Column( + self._data_type, + self._size, + self._data, + mask, + null_count, + self._offset, + self._children, + ) + @staticmethod cdef Column from_column_view(const column_view& cv, Column owner): """Create a Column from a libcudf column_view. @@ -250,7 +276,7 @@ cdef class Column: column is in use. """ data = gpumemoryview(obj) - iface = data.__cuda_array_interface__() + iface = data.__cuda_array_interface__ if iface.get('mask') is not None: raise ValueError("mask not yet supported.") @@ -400,8 +426,8 @@ def is_c_contiguous( itemsize : int Size of an element in bytes. - Return - ------ + Returns + ------- bool The boolean answer. """ diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd similarity index 50% rename from python/cudf/cudf/_lib/expressions.pxd rename to python/cudf/cudf/_lib/pylibcudf/expressions.pxd index 4a20c5fc545..64825b89d9f 100644 --- a/python/cudf/cudf/_lib/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd @@ -1,36 +1,31 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t +# Copyright (c) 2024, NVIDIA CORPORATION. 
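Editorial note: a minimal sketch of the `is_supported_operation` binding added above; `BinaryOperator` is assumed to be the Python-level re-export of `binary_operator`:

```python
import cudf._lib.pylibcudf as plc

ok = plc.binaryop.is_supported_operation(
    plc.DataType(plc.TypeId.FLOAT64),  # output type
    plc.DataType(plc.TypeId.INT32),    # lhs type
    plc.DataType(plc.TypeId.FLOAT64),  # rhs type
    plc.binaryop.BinaryOperator.ADD,   # assumed enum name
)
```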
from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._lib.pylibcudf.libcudf.expressions cimport ( - column_reference, + ast_operator, expression, - literal, - operation, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - numeric_scalar, - scalar, - string_scalar, - timestamp_scalar, + table_reference, ) +from .scalar cimport Scalar + cdef class Expression: cdef unique_ptr[expression] c_obj - cdef class Literal(Expression): - cdef unique_ptr[scalar] c_scalar - + # Hold on to input scalar so it doesn't get gc'ed + cdef Scalar scalar cdef class ColumnReference(Expression): pass - cdef class Operation(Expression): - pass + # Hold on to the input expressions so + # they don't get gc'ed + cdef Expression right + cdef Expression left cdef class ColumnNameReference(Expression): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx new file mode 100644 index 00000000000..38de11406ad --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx @@ -0,0 +1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.expressions import \ + ast_operator as ASTOperator # no-cython-lint +from cudf._lib.pylibcudf.libcudf.expressions import \ + table_reference as TableReference # no-cython-lint + +from cython.operator cimport dereference +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + string_scalar, + timestamp_scalar, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id +from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport ( + duration_ms, + duration_ns, + duration_s, + duration_us, +) +from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( + timestamp_ms, + timestamp_ns, + timestamp_s, + timestamp_us, +) + +from .scalar cimport Scalar +from .traits cimport is_chrono, is_numeric +from .types cimport DataType + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + +cdef class Literal(Expression): + """ + A literal value used in an abstract syntax tree. + + For details, see :cpp:class:`cudf::ast::literal`. + + Parameters + ---------- + value : Scalar + The Scalar value of the Literal. + Must be either numeric, string, or a timestamp/duration scalar. 
+ """ + def __cinit__(self, Scalar value): + self.scalar = value + cdef DataType typ = value.type() + cdef type_id tid = value.type().id() + if not (is_numeric(typ) or is_chrono(typ) or tid == type_id.STRING): + raise ValueError( + "Only numeric, string, or timestamp/duration scalars are accepted" + ) + # TODO: Accept type-erased scalar in AST C++ code + # Then a lot of this code can be deleted + if tid == type_id.INT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.INT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.STRING: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + else: + raise NotImplementedError( + f"Don't know how to make literal with type id {tid}" + ) + +cdef class ColumnReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_reference`. + + Parameters + ---------- + index : size_type + The index of this column in the table + (provided when the expression is evaluated). + table_source : TableReference, default TableReferenece.LEFT + Which table to use in cases with two tables (e.g. joins) + """ + def __cinit__( + self, + size_type index, + table_reference table_source=table_reference.LEFT + ): + self.c_obj = move(make_unique[libcudf_exp.column_reference]( + index, table_source + )) + + +cdef class Operation(Expression): + """ + An operation expression holds an operator and zero or more operands. + + For details, see :cpp:class:`cudf::ast::operation`. + + Parameters + ---------- + op : Operator + left : Expression + Left input expression (left operand) + right: Expression, default None + Right input expression (right operand). + You should only pass this if the input expression is a binary operation. 
+ """ + def __cinit__(self, ast_operator op, Expression left, Expression right=None): + self.left = left + self.right = right + if right is None: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj) + )) + else: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj), dereference(right.c_obj) + )) + +cdef class ColumnNameReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_name_reference`. + + Parameters + ---------- + column_name : str + Name of this column in the table metadata + (provided when the expression is evaluated). + """ + def __cinit__(self, str name): + self.c_obj = \ + move(make_unique[libcudf_exp.column_name_reference]( + (name.encode("utf-8")) + )) diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index a2f5b2ac387..0904022a944 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -22,5 +22,6 @@ cdef class gpumemoryview: # TODO: Need to respect readonly self.ptr = cai["data"][0] + @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index c6c146b0445..eaa05c26986 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -16,6 +16,7 @@ from cudf._lib.pylibcudf.libcudf.groupby cimport ( scan_request, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table @@ -38,6 +39,9 @@ cdef class GroupByRequest: cdef class GroupBy: cdef unique_ptr[groupby] c_obj cdef Table _keys + cdef unique_ptr[vector[order]] _column_order + cdef unique_ptr[vector[null_order]] _null_precedence + cpdef tuple aggregate(self, list requests) cpdef tuple scan(self, list requests) cpdef tuple shift(self, Table values, list offset, list fill_values) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 46fe61025ce..f5bb46ca6a2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -2,7 +2,7 @@ from cython.operator cimport dereference from libcpp.functional cimport reference_wrapper -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector @@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column from .table cimport Table -from .types cimport null_policy, sorted +from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector @@ -87,17 +87,43 @@ cdef class GroupBy: keys : Table The columns to group by. null_handling : null_policy, optional - Whether or not to include null rows in ``keys``. Default is null_policy.EXCLUDE. + Whether or not to include null rows in `keys`. + Default is ``null_policy.EXCLUDE``. keys_are_sorted : sorted, optional - Whether the keys are already sorted. Default is sorted.NO. + Whether the keys are already sorted. Default is ``sorted.NO``. + column_order : list[order] + Indicates the order of each column. Default is ``order.ASCENDING``. 
+ Ignored if `keys_are_sorted` is ``sorted.NO``. + null_precedence : list[null_order] + Indicates the ordering of null values in each column. + Default is ``null_order.AFTER``. Ignored if `keys_are_sorted` is ``sorted.NO``. """ def __init__( self, Table keys, null_policy null_handling=null_policy.EXCLUDE, - sorted keys_are_sorted=sorted.NO + sorted keys_are_sorted=sorted.NO, + list column_order=None, + list null_precedence=None, ): - self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted)) + self._column_order = make_unique[vector[order]]() + self._null_precedence = make_unique[vector[null_order]]() + if column_order is not None: + for o in column_order: + dereference(self._column_order).push_back(o) + if null_precedence is not None: + for o in null_precedence: + dereference(self._null_precedence).push_back(o) + + self.c_obj.reset( + new groupby( + keys.view(), + null_handling, + keys_are_sorted, + dereference(self._column_order.get()), + dereference(self._null_precedence.get()), + ) + ) # keep a reference to the keys table so it doesn't get # deallocated from under us: self._keys = keys diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 084b341ec48..8dd08d11dc8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,7 +21,7 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json - pylibcudf_io_types +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource + pylibcudf_io_json pylibcudf_io_types ) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index ef4c65b277e..5b3272d60e0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +# CSV is removed since it is def not cpdef (to force kw-only arguments) from . cimport avro, datasource, json, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index fb4e4c7e4bb..e17deaa4663 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, datasource, json, types +from . import avro, csv, datasource, json, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx new file mode 100644 index 00000000000..e9efb5befee --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx @@ -0,0 +1,264 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
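Editorial note: a minimal sketch of the extended `GroupBy` constructor from the groupby hunk above, for keys that are already sorted (the enum attributes on `plc.types` and the interop call are assumptions):

```python
import pyarrow as pa
import cudf._lib.pylibcudf as plc

keys = plc.interop.from_arrow(pa.table({"k": [1, 1, 2]}))  # assumed interop path
gb = plc.groupby.GroupBy(
    keys,
    null_handling=plc.types.NullPolicy.EXCLUDE,
    keys_are_sorted=plc.types.Sorted.YES,
    column_order=[plc.types.Order.ASCENDING],     # only consulted for pre-sorted keys
    null_precedence=[plc.types.NullOrder.AFTER],
)
```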
+ +from libcpp cimport bool +from libcpp.map cimport map +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.csv cimport ( + csv_reader_options, + read_csv as cpp_read_csv, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef tuple _process_parse_dates_hex(list cols): + cdef vector[string] str_cols + cdef vector[int] int_cols + for col in cols: + if isinstance(col, str): + str_cols.push_back(col.encode()) + else: + int_cols.push_back(col) + return str_cols, int_cols + +cdef vector[string] _make_str_vector(list vals): + cdef vector[string] res + for val in vals: + res.push_back((val).encode()) + return res + + +def read_csv( + SourceInfo source_info, + *, + compression_type compression = compression_type.AUTO, + size_t byte_range_offset = 0, + size_t byte_range_size = 0, + list col_names = None, + str prefix = "", + bool mangle_dupe_cols = True, + list usecols = None, + size_type nrows = -1, + size_type skiprows = 0, + size_type skipfooter = 0, + size_type header = 0, + str lineterminator = "\n", + str delimiter = None, + str thousands = None, + str decimal = ".", + str comment = None, + bool delim_whitespace = False, + bool skipinitialspace = False, + bool skip_blank_lines = True, + quote_style quoting = quote_style.MINIMAL, + str quotechar = '"', + bool doublequote = True, + list parse_dates = None, + list parse_hex = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + object dtypes = None, + list true_values = None, + list false_values = None, + list na_values = None, + bool keep_default_na = True, + bool na_filter = True, + bool dayfirst = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # bool detect_whitespace_around_quotes = False, + # DataType timestamp_type = DataType(type_id.EMPTY), +): + """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo to read the CSV file from. + compression : compression_type, default CompressionType.AUTO + The compression format of the CSV source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. + col_names : list, default None + The column names to use. + prefix : string, default '' + The prefix to apply to the column names. + mangle_dupe_cols : bool, default True + If True, rename duplicate column names. + usecols : list, default None + Specify the string column names/integer column indices of columns to be read. + nrows : size_type, default -1 + The number of rows to read. + skiprows : size_type, default 0 + The number of rows to skip from the start before reading + skipfooter : size_type, default 0 + The number of rows to skip from the end + header : size_type, default 0 + The index of the row that will be used for header names. + Pass -1 to use default column names. + lineterminator : str, default '\\n' + The character used to determine the end of a line. 
+ delimiter : str, default "," + The character used to separate fields in a row. + thousands : str, default None + The character used as the thousands separator. + Cannot match delimiter. + decimal : str, default '.' + The character used as the decimal separator. + Cannot match delimiter. + comment : str, default None + The character used to identify the start of a comment line. + (which will be skipped by the reader) + delim_whitespace : bool, default False + If True, treat whitespace as the field delimiter. + skipinitialspace : bool, default False + If True, skip whitespace after the delimiter. + skip_blank_lines : bool, default True + If True, ignore empty lines (otherwise line values are parsed as null). + quoting : QuoteStyle, default QuoteStyle.MINIMAL + The quoting style used in the input CSV data. One of + { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } + quotechar : str, default '"' + The character used to indicate quoting. + doublequote : bool, default True + If True, a quote inside a value is double-quoted. + parse_dates : list, default None + A list of integer column indices/string column names + of columns to read as datetime. + parse_hex : list, default None + A list of integer column indices/string column names + of columns to read as hexadecimal. + dtypes : Union[Dict[str, DataType], List[DataType]], default None + A list of data types or a dictionary mapping column names + to a DataType. + true_values : List[str], default None + A list of additional values to recognize as True. + false_values : List[str], default None + A list of additional values to recognize as False. + na_values : List[str], default None + A list of additional values to recognize as null. + keep_default_na : bool, default True + Whether to keep the built-in default N/A values. + na_filter : bool, default True + Whether to detect missing values. If False, can + improve performance. + dayfirst : bool, default False + If True, interpret dates as being in the DD/MM format. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. 
+ """ + cdef vector[string] c_parse_dates_names + cdef vector[int] c_parse_dates_indexes + cdef vector[int] c_parse_hex_names + cdef vector[int] c_parse_hex_indexes + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map + + cdef csv_reader_options options = move( + csv_reader_options.builder(source_info.c_obj) + .compression(compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .nrows(nrows) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(ord(lineterminator)) + .quotechar(ord(quotechar)) + .decimal(ord(decimal)) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if col_names is not None: + options.set_names([str(name).encode() for name in col_names]) + + if prefix is not None: + options.set_prefix(prefix.encode()) + + if usecols is not None: + if all([isinstance(col, int) for col in usecols]): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name).encode() for name in usecols]) + + if delimiter is not None: + options.set_delimiter(ord(delimiter)) + + if thousands is not None: + options.set_thousands(ord(thousands)) + + if comment is not None: + options.set_comment(ord(comment)) + + if parse_dates is not None: + if not all([isinstance(col, (str, int)) for col in parse_dates]): + raise NotImplementedError( + "`parse_dates`: Must pass a list of column names/indices") + + # Set both since users are allowed to mix column names and indices + c_parse_dates_names, c_parse_dates_indexes = \ + _process_parse_dates_hex(parse_dates) + options.set_parse_dates(c_parse_dates_names) + options.set_parse_dates(c_parse_dates_indexes) + + if parse_hex is not None: + if not all([isinstance(col, (str, int)) for col in parse_hex]): + raise NotImplementedError( + "`parse_hex`: Must pass a list of column names/indices") + + # Set both since users are allowed to mix column names and indices + c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) + options.set_parse_hex(c_parse_hex_names) + options.set_parse_hex(c_parse_hex_indexes) + + if isinstance(dtypes, list): + for dtype in dtypes: + c_dtypes_list.push_back((dtype).c_obj) + options.set_dtypes(c_dtypes_list) + elif isinstance(dtypes, dict): + # dtypes_t is dict + for k, v in dtypes.items(): + c_dtypes_map[str(k).encode()] = (v).c_obj + options.set_dtypes(c_dtypes_map) + elif dtypes is not None: + raise TypeError("dtypes must either by a list/dict") + + if true_values is not None: + options.set_true_values(_make_str_vector(true_values)) + + if false_values is not None: + options.set_false_values(_make_str_vector(false_values)) + + if na_values is not None: + options.set_na_values(_make_str_vector(na_values)) + + cdef table_with_metadata c_result + with nogil: + c_result = move(cpp_read_csv(options)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx index aa7fa0efdaf..8f265f585de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx @@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from 
cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +import warnings + cdef class Datasource: cdef datasource* get_datasource(self) except * nogil: @@ -16,10 +18,16 @@ cdef class Datasource: cdef class NativeFileDatasource(Datasource): - def __cinit__(self, NativeFile native_file,): + def __cinit__(self, NativeFile native_file): cdef shared_ptr[CRandomAccessFile] ra_src + warnings.warn( + "Support for reading pyarrow's NativeFile is deprecated " + "and will be removed in a future release of cudf.", + FutureWarning, + ) + ra_src = native_file.get_random_access_file() self.c_datasource.reset(new arrow_io_source(ra_src)) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index a91d574131f..2e0e92a054f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -1,11 +1,30 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from libcpp cimport bool -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, + compression_type, +) +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t from cudf._lib.pylibcudf.libcudf.types cimport size_type +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = *, + compression_type compression = *, + bool lines = *, + size_type byte_range_offset = *, + size_type byte_range_size = *, + bool keep_quotes = *, + bool mixed_types_as_string = *, + bool prune_columns = *, + json_recovery_mode_t recovery_mode = *, +) + + cpdef void write_json( SinkInfo sink_info, TableWithMetadata tbl, @@ -16,3 +35,14 @@ cpdef void write_json( str true_value = *, str false_value = * ) + +cpdef tuple chunked_read_json( + SourceInfo source_info, + list dtypes = *, + compression_type compression = *, + bool keep_quotes = *, + bool mixed_types_as_string = *, + bool prune_columns = *, + json_recovery_mode_t recovery_mode = *, + int chunk_size= *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 7530eba3803..2710ee60075 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -1,16 +1,262 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- from libcpp cimport bool from libcpp.limits cimport numeric_limits +from libcpp.map cimport map from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.concatenate cimport concatenate +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, +) from cudf._lib.pylibcudf.libcudf.io.json cimport ( + json_reader_options, + json_recovery_mode_t, json_writer_options, + read_json as cpp_read_json, + schema_element, write_json as cpp_write_json, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata -from cudf._lib.pylibcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef map[string, schema_element] _generate_schema_map(list dtypes): + cdef map[string, schema_element] schema_map + cdef schema_element s_elem + cdef string c_name + + for name, dtype, child_dtypes in dtypes: + if not (isinstance(name, str) and + isinstance(dtype, DataType) and + isinstance(child_dtypes, list)): + + raise ValueError("Must pass a list of a tuple containing " + "(column_name, column_dtype, list of child_dtypes)") + + c_name = name.encode() + + s_elem.type = (dtype).c_obj + s_elem.child_types = _generate_schema_map(child_dtypes) + + schema_map[c_name] = s_elem + return schema_map + + +cdef json_reader_options _setup_json_reader_options( + SourceInfo source_info, + list dtypes, + compression_type compression, + bool lines, + size_type byte_range_offset, + size_type byte_range_size, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + json_recovery_mode_t recovery_mode): + + cdef vector[data_type] types_vec + cdef json_reader_options opts = move( + json_reader_options.builder(source_info.c_obj) + .compression(compression) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .recovery_mode(recovery_mode) + .build() + ) + + if dtypes is not None: + if isinstance(dtypes[0], tuple): + opts.set_dtypes(move(_generate_schema_map(dtypes))) + else: + for dtype in dtypes: + types_vec.push_back((dtype).c_obj) + opts.set_dtypes(types_vec) + + opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) + return opts + + +cpdef tuple chunked_read_json( + SourceInfo source_info, + list dtypes = None, + compression_type compression = compression_type.AUTO, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, + int chunk_size=100_000_000, +): + """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the JSON file from. + dtypes : list, default None + Set data types for the columns in the JSON file. + + Each element of the list has the format + (column_name, column_dtype, list of child dtypes), where + the list of child dtypes is an empty list if the child is not + a nested type (list or struct dtype), and is of format + (column_child_name, column_child_type, list of grandchild dtypes). + compression: CompressionType, default CompressionType.AUTO + The compression format of the JSON source. 
+ keep_quotes : bool, default False + Whether the reader should keep quotes of string values. + mixed_types_as_string : bool, default False + If True, mixed type columns are returned as string columns. + If `False` parsing mixed type columns will thrown an error. + prune_columns : bool, default False + Whether to only read columns specified in dtypes. + recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL + Whether to raise an error or set corresponding values to null + when encountering an invalid JSON line. + chunk_size : int, default 100_000_000 bytes. + The number of bytes to be read in chunks. + The chunk_size should be set to at least row_size. + + Returns + ------- + tuple + A tuple of (columns, column_name, child_names) + """ + cdef size_type c_range_size = ( + chunk_size if chunk_size is not None else 0 + ) + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=True, + byte_range_offset=0, + byte_range_size=0, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, + ) + + # Read JSON + cdef table_with_metadata c_result + + final_columns = [] + meta_names = None + child_names = None + i = 0 + while True: + opts.set_byte_range_offset(c_range_size * i) + opts.set_byte_range_size(c_range_size) + + try: + with nogil: + c_result = move(cpp_read_json(opts)) + except (ValueError, OverflowError): + break + if meta_names is None: + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] + if child_names is None: + child_names = TableWithMetadata._parse_col_names( + c_result.metadata.schema_info + ) + new_chunk = [ + col for col in TableWithMetadata.from_libcudf( + c_result).columns + ] + + if len(final_columns) == 0: + final_columns = new_chunk + else: + for col_idx in range(len(meta_names)): + final_columns[col_idx] = concatenate( + [final_columns[col_idx], new_chunk[col_idx]] + ) + # Must drop any residual GPU columns to save memory + new_chunk[col_idx] = None + i += 1 + return (final_columns, meta_names, child_names) + + +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = None, + compression_type compression = compression_type.AUTO, + bool lines = False, + size_type byte_range_offset = 0, + size_type byte_range_size = 0, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, +): + """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the JSON file from. + dtypes : list, default None + Set data types for the columns in the JSON file. + + Each element of the list has the format + (column_name, column_dtype, list of child dtypes), where + the list of child dtypes is an empty list if the child is not + a nested type (list or struct dtype), and is of format + (column_child_name, column_child_type, list of grandchild dtypes). + compression: CompressionType, default CompressionType.AUTO + The compression format of the JSON source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. + keep_quotes : bool, default False + Whether the reader should keep quotes of string values. 
+ mixed_types_as_string : bool, default False + If True, mixed type columns are returned as string columns. + If `False` parsing mixed type columns will thrown an error. + prune_columns : bool, default False + Whether to only read columns specified in dtypes. + recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL + Whether to raise an error or set corresponding values to null + when encountering an invalid JSON line. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=lines, + byte_range_offset=byte_range_offset, + byte_range_size=byte_range_size, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, + ) + + # Read JSON + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_json(opts)) + + return TableWithMetadata.from_libcudf(c_result) cpdef void write_json( diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 88daf54f33b..0094bf6032c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -28,11 +28,19 @@ cdef class TableWithMetadata: cdef vector[column_name_info] _make_column_info(self, list column_names) + cdef list _make_columns_list(self, dict child_dict) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos) + @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) cdef class SourceInfo: cdef source_info c_obj + # Keep the bytes converted from stringio alive + # (otherwise we end up with a use after free when they get gc'ed) + cdef list byte_sources cdef class SinkInfo: # This vector just exists to keep the unique_ptrs to the sinks alive diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index f94e20970a4..68498ff88f4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -22,6 +22,11 @@ import errno import io import os +from cudf._lib.pylibcudf.libcudf.io.json import \ + json_recovery_mode_t as JSONRecoveryMode # no-cython-lint +from cudf._lib.pylibcudf.libcudf.io.types import \ + compression_type as CompressionType # no-cython-lint + cdef class TableWithMetadata: """A container holding a table and its associated metadata @@ -69,16 +74,44 @@ cdef class TableWithMetadata: """ return self.tbl.columns() - @property - def column_names(self): + cdef list _make_columns_list(self, dict child_dict): + cdef list names = [] + for child in child_dict: + grandchildren = self._make_columns_list(child_dict[child]) + names.append((child, grandchildren)) + return names + + def column_names(self, include_children=False): """ Return a list containing the column names of the table """ cdef list names = [] + cdef str name + cdef dict child_names = self.child_names for col_info in self.metadata.schema_info: - # TODO: Handle nesting (columns with child columns) - assert col_info.children.size() == 0, "Child column names are not handled!" 
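Editorial note: a minimal sketch of the nested-dtype tuples and the `child_names` round trip accepted by the `read_json` wrapper above:

```python
import cudf._lib.pylibcudf as plc

twm = plc.io.json.read_json(
    plc.io.SourceInfo([b'{"a": 1, "b": {"c": "x"}}\n']),
    dtypes=[
        ("a", plc.DataType(plc.TypeId.INT64), []),
        ("b", plc.DataType(plc.TypeId.STRUCT), [
            ("c", plc.DataType(plc.TypeId.STRING), []),
        ]),
    ],
    lines=True,
)
print(twm.child_names)  # expected: {'a': {}, 'b': {'c': {}}}
```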
- names.append(col_info.name.decode()) + name = col_info.name.decode() + if include_children: + children = self._make_columns_list(child_names[name]) + names.append((name, children)) + else: + names.append(name) + return names + + @property + def child_names(self): + """ + Return a dictionary mapping the names of columns with children + to the names of their child columns + """ + return TableWithMetadata._parse_col_names(self.metadata.schema_info) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos): + cdef dict child_names = dict() + cdef dict names = dict() + for col_info in infos: + child_names = TableWithMetadata._parse_col_names(col_info.children) + names[col_info.name.decode()] = child_names return names @staticmethod @@ -137,6 +170,15 @@ cdef class SourceInfo: cdef vector[host_buffer] c_host_buffers cdef const unsigned char[::1] c_buffer cdef bint empty_buffer = False + cdef list new_sources = [] + + if isinstance(sources[0], io.StringIO): + for buffer in sources: + if not isinstance(buffer, io.StringIO): + raise ValueError("All sources must be of the same type!") + new_sources.append(buffer.read().encode()) + sources = new_sources + self.byte_sources = sources if isinstance(sources[0], bytes): empty_buffer = True for buffer in sources: @@ -156,7 +198,10 @@ cdef class SourceInfo: c_buffer.shape[0])) else: raise ValueError("Sources must be a list of str/paths, " - "bytes, io.BytesIO, or a Datasource") + "bytes, io.BytesIO, io.StringIO, or a Datasource") + + if empty_buffer is True: + c_host_buffers.push_back(host_buffer(NULL, 0)) self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx index 308b1b39291..2ded84d84d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/join.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx @@ -10,12 +10,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.pylibcudf.libcudf cimport join as cpp_join from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport ( - data_type, - null_equality, - size_type, - type_id, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_equality from .column cimport Column from .table cimport Table @@ -23,15 +18,11 @@ from .table cimport Table cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column - cdef device_buffer c_empty - cdef size_type size = dereference(gather_map.get()).size() return Column.from_libcudf( move( make_unique[column]( - data_type(type_id.INT32), - size, - dereference(gather_map.get()).release(), - move(c_empty), + move(dereference(gather_map.get())), + device_buffer(), 0 ) ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 6c66d01ca57..b04e94f1546 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx - stream_compaction.pyx types.pyx unary.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx + round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) @@ -22,4 +22,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) +add_subdirectory(io) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd index 0eda7d34ff9..b34fea6a775 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd @@ -1,9 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.exception_handler cimport cudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar @@ -19,9 +21,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: TRUE_DIV FLOOR_DIV MOD + PMOD PYMOD POW INT_POW + LOG_BASE + ATAN2 + SHIFT_LEFT + SHIFT_RIGHT + SHIFT_RIGHT_UNSIGNED + BITWISE_AND + BITWISE_OR + BITWISE_XOR + LOGICAL_AND + LOGICAL_OR EQUAL NOT_EQUAL LESS @@ -29,38 +42,46 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: LESS_EQUAL GREATER_EQUAL NULL_EQUALS + NULL_MAX + NULL_MIN NULL_NOT_EQUALS - BITWISE_AND - BITWISE_OR - BITWISE_XOR - LOGICAL_AND - LOGICAL_OR GENERIC_BINARY + NULL_LOGICAL_AND + NULL_LOGICAL_OR + INVALID_BINARY cdef unique_ptr[column] binary_operation ( const scalar& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except + + ) except +cudf_exception_handler + +cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: + cdef bool is_supported_operation( + data_type output_type, + data_type lhs_type, + data_type rhs_type, + binary_operator op + ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd index 279d969db50..427e16d4ff8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -14,63 +15,63 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: - ctypedef enum ast_operator: + cpdef enum class ast_operator(int32_t): # Binary operators - ADD "cudf::ast::ast_operator::ADD" - SUB "cudf::ast::ast_operator::SUB" - MUL "cudf::ast::ast_operator::MUL" - DIV "cudf::ast::ast_operator::DIV" - TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" - FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" - MOD "cudf::ast::ast_operator::MOD" - PYMOD "cudf::ast::ast_operator::PYMOD" - POW "cudf::ast::ast_operator::POW" - EQUAL "cudf::ast::ast_operator::EQUAL" - NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" - NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" - LESS "cudf::ast::ast_operator::LESS" - GREATER "cudf::ast::ast_operator::GREATER" - LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" - GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" - BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" - BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" - BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" - LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" - LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + ADD + SUB + MUL + DIV + TRUE_DIV + FLOOR_DIV + MOD + PYMOD + POW + EQUAL + NULL_EQUAL + NOT_EQUAL + LESS + GREATER + LESS_EQUAL + GREATER_EQUAL + BITWISE_AND + BITWISE_OR + BITWISE_XOR + NULL_LOGICAL_AND + LOGICAL_AND + NULL_LOGICAL_OR + LOGICAL_OR # Unary operators - IDENTITY "cudf::ast::ast_operator::IDENTITY" - IS_NULL "cudf::ast::ast_operator::IS_NULL" - SIN "cudf::ast::ast_operator::SIN" - COS "cudf::ast::ast_operator::COS" - TAN "cudf::ast::ast_operator::TAN" - ARCSIN "cudf::ast::ast_operator::ARCSIN" - ARCCOS "cudf::ast::ast_operator::ARCCOS" - ARCTAN "cudf::ast::ast_operator::ARCTAN" - SINH "cudf::ast::ast_operator::SINH" - COSH "cudf::ast::ast_operator::COSH" - TANH "cudf::ast::ast_operator::TANH" - ARCSINH "cudf::ast::ast_operator::ARCSINH" - ARCCOSH "cudf::ast::ast_operator::ARCCOSH" - ARCTANH "cudf::ast::ast_operator::ARCTANH" - EXP "cudf::ast::ast_operator::EXP" - LOG "cudf::ast::ast_operator::LOG" - SQRT "cudf::ast::ast_operator::SQRT" - CBRT "cudf::ast::ast_operator::CBRT" - CEIL "cudf::ast::ast_operator::CEIL" - FLOOR "cudf::ast::ast_operator::FLOOR" - ABS "cudf::ast::ast_operator::ABS" - RINT "cudf::ast::ast_operator::RINT" - BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" - NOT "cudf::ast::ast_operator::NOT" + IDENTITY + IS_NULL + SIN + COS + TAN + ARCSIN + ARCCOS + ARCTAN + SINH + COSH + TANH + ARCSINH + ARCCOSH + ARCTANH + EXP + LOG + SQRT + CBRT + CEIL + FLOOR + ABS + RINT + BIT_INVERT + NOT cdef cppclass expression: pass - ctypedef enum table_reference: - LEFT "cudf::ast::table_reference::LEFT" - RIGHT "cudf::ast::table_reference::RIGHT" + cpdef enum class table_reference(int32_t): + LEFT + RIGHT cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..6831063ecb9 --- 
/dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt @@ -0,0 +1,26 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources json.pyx types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_ +) + +set(targets_using_arrow_headers cpp_io_json cpp_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 2e50cccd132..86621ae184f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport uint8_t +from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -19,9 +19,9 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types - cdef enum json_recovery_mode_t: - FAIL "cudf::io::json_recovery_mode_t::FAIL" - RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cpdef enum class json_recovery_mode_t(int32_t): + FAIL + RECOVER_WITH_NULL cdef cppclass json_reader_options: json_reader_options() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 0ef6553db56..c38f39f7749 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -78,6 +78,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_metadata( cudf_io_types.table_input_metadata m @@ -103,6 +104,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_page_size_rows(size_type val) except + void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + cdef cppclass parquet_writer_options(parquet_writer_options_base): @@ -143,6 +145,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: BuilderT& utc_timestamps( bool enabled ) except + + BuilderT& write_arrow_schema( + bool enabled + ) except + BuilderT& 
row_group_size_bytes( size_t val ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd index 38bdd4db0bb..ba57a839fbc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd @@ -9,4 +9,4 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: - cdef unique_ptr[column] count_elements(const lists_column_view) except + + cdef unique_ptr[column] count_elements(const lists_column_view&) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd index caa12f41914..53609ba8830 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd @@ -11,10 +11,10 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] extract_list_element( - const lists_column_view, + const lists_column_view&, size_type ) except + cdef unique_ptr[column] extract_list_element( - const lists_column_view, - column_view + const lists_column_view&, + const column_view& ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd index 17b4c1877a6..ab7ed141365 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd @@ -10,6 +10,6 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] segmented_gather( - const lists_column_view source_column, - const lists_column_view gather_map_list + const lists_column_view& source_column, + const lists_column_view& gather_map_list ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index fd21e7b334b..8917a6ac899 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -10,7 +10,9 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): lists_column_view() except + + lists_column_view(const lists_column_view& lists_column) except + lists_column_view(const column_view& lists_column) except + + lists_column_view& operator=(const lists_column_view&) except + column_view parent() except + column_view offsets() except + column_view child() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd new file mode 100644 index 00000000000..0382a5d42c3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
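The const-reference fixes above and the reverse.pxd header this hunk introduces back the pylibcudf.lists helpers declared further down in this diff (reverse, segmented_gather, extract_list_element, count_elements). A minimal usage sketch, assuming pylibcudf is importable as cudf._lib.pylibcudf and that list columns can round-trip through interop.from_arrow; the pyarrow round-trip is an assumption for illustration, not part of this hunk.

    # Sketch of the pylibcudf list utilities added in this diff; the pyarrow
    # round-trip is assumed for illustration.
    import pyarrow as pa

    import cudf._lib.pylibcudf as plc

    lists = plc.interop.from_arrow(pa.array([[3, 1, 2], [9], [5, 4]]))

    reversed_lists = plc.lists.reverse(lists)          # [[2, 1, 3], [9], [4, 5]]
    lengths = plc.lists.count_elements(lists)          # [3, 1, 2]
    firsts = plc.lists.extract_list_element(lists, 0)  # [3, 9, 5]

    gather_map = plc.interop.from_arrow(pa.array([[2, 0], [0], []]))
    gathered = plc.lists.segmented_gather(lists, gather_map)  # [[2, 3], [9], []]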
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) + + +cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] reverse( + const lists_column_view& lists_column, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd index 7f8ae2b7617..2a1b189af51 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column @@ -43,5 +44,6 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: cdef extern unique_ptr[column] cast( column_view input, data_type out_type) except + + cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept cdef extern unique_ptr[column] is_nan(column_view input) except + cdef extern unique_ptr[column] is_not_nan(column_view input) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd new file mode 100644 index 00000000000..0cc58af735b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport data_type + + +cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: + cdef bool is_relationally_comparable(data_type) + cdef bool is_equality_comparable(data_type) + cdef bool is_numeric(data_type) + cdef bool is_index_type(data_type) + cdef bool is_unsigned(data_type) + cdef bool is_integral(data_type) + cdef bool is_integral_not_bool(data_type) + cdef bool is_floating_point(data_type) + cdef bool is_boolean(data_type) + cdef bool is_timestamp(data_type) + cdef bool is_fixed_point(data_type) + cdef bool is_duration(data_type) + cdef bool is_chrono(data_type) + cdef bool is_dictionary(data_type) + cdef bool is_fixed_width(data_type) + cdef bool is_compound(data_type) + cdef bool is_nested(data_type) + cdef bool is_bit_castable(data_type, data_type) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd new file mode 100644 index 00000000000..890fca3a662 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
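The is_supported_cast declaration above is surfaced as a cpdef function, pylibcudf.unary.is_supported_cast, later in this diff. A small sketch of probing cast support before calling cast; module paths follow those used elsewhere in the diff, and the printed results assume libcudf's fixed-width-only cast rules.

    # Probe whether libcudf's cast() supports a conversion before attempting it.
    from cudf._lib.pylibcudf import types as plc_types, unary as plc_unary

    int64 = plc_types.DataType(plc_types.TypeId.INT64)
    float32 = plc_types.DataType(plc_types.TypeId.FLOAT32)
    string = plc_types.DataType(plc_types.TypeId.STRING)

    print(plc_unary.is_supported_cast(int64, float32))  # fixed-width to fixed-width: True
    print(plc_unary.is_supported_cast(int64, string))   # strings go through a separate API: False

    # Also added in this diff: a DataType matching libcudf's size_type.
    print(plc_types.SIZE_TYPE.id())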
+ +from cudf._lib.pylibcudf.libcudf.types cimport type_id + + +cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil: + cdef type_id type_to_id[T]() diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2ccf0139e90..38eb575ee8d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -12,6 +12,10 @@ ctypedef fused ColumnOrScalar: Column Scalar +ctypedef fused ColumnOrSizeType: + Column + size_type + cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) @@ -23,3 +27,11 @@ cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) cpdef Column index_of(Column, ColumnOrScalar, bool) + +cpdef Column reverse(Column) + +cpdef Column segmented_gather(Column, Column) + +cpdef Column extract_list_element(Column, ColumnOrSizeType) + +cpdef Column count_elements(Column) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index a94d940accd..ea469642dd5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -9,15 +9,23 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + gather as cpp_gather, + reverse as cpp_reverse, ) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( + count_elements as cpp_count_elements, +) +from cudf._lib.pylibcudf.libcudf.lists.extract cimport ( + extract_list_element as cpp_extract_list_element, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.lists cimport ColumnOrScalar +from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView from .scalar cimport Scalar @@ -206,3 +214,109 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o find_option, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column reverse(Column input): + """Reverse the element order within each list of the input column. + + For details, see :cpp:func:`reverse`. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column with reversed lists. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_reverse.reverse( + list_view.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column segmented_gather(Column input, Column gather_map_list): + """Create a column with elements gathered based on the indices in gather_map_list + + For details, see :cpp:func:`segmented_gather`. + + Parameters + ---------- + input : Column + The input column. + gather_map_list : Column + The indices of the lists column to gather. 
+ + Returns + ------- + Column + A new Column with elements in list of rows + gathered based on gather_map_list + """ + + cdef unique_ptr[column] c_result + cdef ListColumnView list_view1 = input.list_view() + cdef ListColumnView list_view2 = gather_map_list.list_view() + + with nogil: + c_result = move(cpp_gather.segmented_gather( + list_view1.view(), + list_view2.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column extract_list_element(Column input, ColumnOrSizeType index): + """Create a column of extracted list elements. + + Parameters + ---------- + input : Column + The input column. + index : Union[Column, size_type] + The selection index or indices. + + Returns + ------- + Column + A new Column with elements extracted. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_extract_list_element( + list_view.view(), + index.view() if ColumnOrSizeType is Column else index, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column count_elements(Column input): + """Count the number of rows in each + list element in the given lists column. + For details, see :cpp:func:`count_elements`. + + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of the lengths of each list element + """ + cdef ListColumnView list_view = input.list_view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_count_elements(list_view.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd new file mode 100644 index 00000000000..668fa775202 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..d2370f8d641 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + For details, see :cpp:func:`is_relationally_comparable`. + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + For details, see :cpp:func:`is_equality_comparable`. 
+ """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + For details, see :cpp:func:`is_numeric`. + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. + + For details, see :cpp:func:`is_index_type`. + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + For details, see :cpp:func:`is_unsigned`. + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + For details, see :cpp:func:`is_integral`. + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + For details, see :cpp:func:`is_integral_not_bool`. + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + For details, see :cpp:func:`is_floating_point`. + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + For details, see :cpp:func:`is_boolean`. + """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + For details, see :cpp:func:`is_timestamp`. + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + For details, see :cpp:func:`is_fixed_point`. + """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + For details, see :cpp:func:`is_duration`. + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. + + For details, see :cpp:func:`is_chrono`. + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. + + For details, see :cpp:func:`is_dictionary`. + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + For details, see :cpp:func:`is_fixed_width`. + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + For details, see :cpp:func:`is_compound`. + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + For details, see :cpp:func:`is_nested`. + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + For details, see :cpp:func:`is_bit_castable`. + """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/transform.pxd new file mode 100644 index 00000000000..4b21feffe25 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx new file mode 100644 index 00000000000..a734e71b820 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pyx @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move, pair + +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform +from cudf._lib.pylibcudf.libcudf.types cimport size_type + +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): + """Create a null mask preserving existing nulls and converting nans to null. + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + Two-tuple of a gpumemoryview wrapping the null mask and the new null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.nans_to_nulls(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 6dbb287f3c4..c45c6071bb3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,7 +2,8 @@ from libc.stdint cimport int32_t -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id +from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip @@ -67,3 +68,7 @@ cdef class DataType: cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret + + +SIZE_TYPE = DataType(type_to_id[size_type]()) +SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/unary.pxd index 4aa4543bb80..d07df838172 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/unary.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator from .column cimport Column @@ -17,3 +19,5 @@ cpdef Column cast(Column input, DataType data_type) cpdef Column is_nan(Column input) cpdef Column is_not_nan(Column input) + +cpdef bool is_supported_cast(DataType from_, DataType to) diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/cudf/cudf/_lib/pylibcudf/unary.pyx index 0879b501a49..8da46f0a832 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/unary.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -154,3 +155,23 @@ cpdef Column is_not_nan(Column input): result = move(cpp_unary.is_not_nan(input.view())) return Column.from_libcudf(move(result)) + +cpdef bool is_supported_cast(DataType from_, DataType to): + """Check if a cast between datatypes is supported. 
+ + For details, see :cpp:func:`is_supported_cast`. + + Parameters + ---------- + from_ + The source datatype + to + The target datatype + + Returns + ------- + bool + True if the cast is supported. + """ + with nogil: + return cpp_unary.is_supported_cast(from_.c_obj, to.c_obj) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 56bfa0ba332..64634b7a6f9 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import warnings import cudf from cudf.core.buffer import acquire_spill_lock @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): A numpy data type to use for the output, defaults to the same type as the input column """ - - col_dtype = ( - dtype if dtype is not None - else incol._reduction_result_dtype(reduction_op) - ) + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be remove in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning + ) + col_dtype = dtype + else: + col_dtype = incol._reduction_result_dtype(reduction_op) # check empty case if len(incol) <= incol.null_count: diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index b325173f20d..622725e06a3 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -19,7 +19,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression +from cudf._lib.pylibcudf cimport transform as plc_transform +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression @@ -82,18 +83,10 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): @acquire_spill_lock() def nans_to_nulls(Column input): - cdef column_view c_input = input.view() - cdef pair[unique_ptr[device_buffer], size_type] c_output - cdef unique_ptr[device_buffer] c_buffer - - with nogil: - c_output = move(libcudf_transform.nans_to_nulls(c_input)) - c_buffer = move(c_output.first) - - if c_output.second == 0: - return None - - return as_buffer(DeviceBuffer.c_from_unique_ptr(move(c_buffer))) + (mask, _) = plc_transform.nans_to_nulls( + input.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 895e1afc502..253fdf7b0d9 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -21,8 +21,6 @@ from cudf._lib.types cimport ( import cudf from cudf._lib import pylibcudf -size_type_dtype = np.dtype("int32") - class TypeId(IntEnum): EMPTY = libcudf_types.type_id.EMPTY @@ -150,6 +148,8 @@ datetime_unit_map = { TypeId.TIMESTAMP_NANOSECONDS: "ns", } +size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] + class Interpolation(IntEnum): LINEAR = ( @@ -239,6 +239,9 @@ cdef dtype_from_column_view(column_view cv): ] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: + # Note: This function is to be phased out in favor of + # dtype_to_pylibcudf_type which will return a pylibcudf + # DataType object cdef libcudf_types.type_id tid if isinstance(dtype, cudf.ListDtype): tid 
= libcudf_types.type_id.LIST diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 99850d549a1..1d55f7218dc 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -19,3 +19,4 @@ cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) cdef columns_from_table_view(table_view tv, object owners) cdef columns_from_pylibcudf_table(tbl) +cdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index de6b9f690b6..f136cd997a7 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -322,7 +322,7 @@ cdef data_from_pylibcudf_io(tbl_with_meta): """ return _data_from_columns( columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=tbl_with_meta.column_names, + column_names=tbl_with_meta.column_names(include_children=False), index_names=None ) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index d97e9c815b6..294ae2fd985 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -90,7 +90,7 @@ def is_integer(obj): bool """ if isinstance(obj, cudf.Scalar): - return pd.api.types.is_integer_dtype(obj.dtype) + return obj.dtype.kind in "iu" return pd.api.types.is_integer(obj) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..c38352009de 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,15 +19,7 @@ ) from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_scalar, - is_signed_integer_dtype, - is_unsigned_integer_dtype, -) +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.errors import MixedTypeError @@ -38,6 +30,8 @@ if TYPE_CHECKING: from collections.abc import Generator + import cupy + from cudf.core.column_accessor import ColumnAccessor @@ -61,6 +55,12 @@ def copy(self, deep: bool = True) -> Self: def __len__(self): raise NotImplementedError + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + @property def size(self): # The size of an index is always its length irrespective of dimension. @@ -608,20 +608,14 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( - not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) + if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( + self.dtype.kind != "b" and other.dtype.kind == "b" ): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. raise MixedTypeError("Cannot perform union with mixed types") - if ( - is_signed_integer_dtype(self.dtype) - and is_unsigned_integer_dtype(other.dtype) - ) or ( - is_unsigned_integer_dtype(self.dtype) - and is_signed_integer_dtype(other.dtype) + if (self.dtype.kind == "i" and other.dtype.kind == "u") or ( + self.dtype.kind == "u" and other.dtype.kind == "i" ): # signed + unsigned types will result in # mixed type for union in pandas. 
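The index changes above replace pandas-style is_*_dtype helpers with plain NumPy dtype.kind comparisons. For reference, a short illustration of the kind codes these checks rely on.

    # NumPy dtype.kind codes relied on by the checks above:
    #   "b" boolean, "i" signed integer, "u" unsigned integer,
    #   "f" floating point, "M" datetime64, "m" timedelta64
    import numpy as np

    print(np.dtype("bool").kind)            # "b"
    print(np.dtype("uint8").kind)           # "u"
    print(np.dtype("uint8").kind in "iu")   # True: integer, signed or unsigned
    print(np.dtype("datetime64[ns]").kind)  # "M"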
@@ -2001,7 +1995,7 @@ def drop_duplicates( self._column_names, ) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: """ Indicate duplicate index values. @@ -2098,7 +2092,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. - if not is_integer_dtype(gather_map.dtype): + if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) if not _gather_map_is_valid( @@ -2152,7 +2146,7 @@ def _apply_boolean_mask(self, boolean_mask): Rows corresponding to `False` is dropped. """ boolean_mask = cudf.core.column.as_column(boolean_mask) - if not is_bool_dtype(boolean_mask.dtype): + if boolean_mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 393a68dd844..63714a78572 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -4,7 +4,10 @@ import ast import functools -from cudf._lib.expressions import ( +import pyarrow as pa + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.expressions import ( ASTOperator, ColumnReference, Expression, @@ -122,7 +125,9 @@ def visit_Constant(self, node): f"Unsupported literal {repr(node.value)} of type " "{type(node.value).__name__}" ) - self.stack.append(Literal(node.value)) + self.stack.append( + Literal(plc.interop.from_arrow(pa.scalar(node.value))) + ) def visit_UnaryOp(self, node): self.visit(node.operand) @@ -132,7 +137,7 @@ def visit_UnaryOp(self, node): # operand, so there's no way to know whether this should be a float # or an int. We should maybe see what Spark does, and this will # probably require casting. - self.nodes.append(Literal(-1)) + self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) op = ASTOperator.MUL self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) elif isinstance(node.op, ast.UAdd): diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 44ce0ddef25..18ab32d2c9e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,18 +7,9 @@ import numpy as np import cudf -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import ( - _can_cast, - _dtype_can_hold_element, - find_common_type, - is_mixed_with_object_dtype, -) +from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype if TYPE_CHECKING: from cudf._typing import ScalarLike @@ -48,19 +39,25 @@ def _check_and_cast_columns_with_other( inplace: bool, ) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. 
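With the expressions.py change above, AST literals are wrapped as pylibcudf scalars via plc.interop.from_arrow before Literal nodes are built. A hand-rolled sketch of the same construction; the specific column index and operands are illustrative only.

    # Build expression nodes the way visit_Constant/visit_UnaryOp now do,
    # e.g. something like "col0 * -1 + 2" (operands are illustrative).
    import pyarrow as pa

    import cudf._lib.pylibcudf as plc
    from cudf._lib.pylibcudf.expressions import (
        ASTOperator,
        ColumnReference,
        Literal,
        Operation,
    )

    col0 = ColumnReference(0)
    minus_one = Literal(plc.interop.from_arrow(pa.scalar(-1)))
    two = Literal(plc.interop.from_arrow(pa.scalar(2)))

    negated = Operation(ASTOperator.MUL, col0, minus_one)
    expr = Operation(ASTOperator.ADD, negated, two)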
+ from cudf.core.column import as_column + source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): return _normalize_categorical(source_col, other) other_is_scalar = is_scalar(other) if other_is_scalar: - if (isinstance(other, float) and not np.isnan(other)) and ( - source_dtype.type(other) != other - ): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {source_dtype.name}" - ) + if isinstance(other, (float, np.floating)) and not np.isnan(other): + try: + is_safe = source_dtype.type(other) == other + except OverflowError: + is_safe = False + + if not is_safe: + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {source_dtype.name}" + ) if cudf.utils.utils.is_na_like(other): return _normalize_categorical( @@ -84,15 +81,9 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( - other, source_dtype - ): - common_dtype = source_dtype - elif ( - isinstance(source_col, cudf.core.column.NumericalColumn) - and other_is_scalar - and _dtype_can_hold_element(source_dtype, other) - ): + if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + other + ).can_cast_safely(source_dtype): common_dtype = source_dtype else: common_dtype = find_common_type( @@ -106,7 +97,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) + source_dtype.kind == "b" and common_dtype.kind != "b" ): raise TypeError(mixed_err) @@ -128,3 +119,58 @@ def _make_categorical_like(result, column): ordered=column.ordered, ) return result + + +def _can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. 
+ """ + if cudf.utils.utils.is_na_like(from_dtype): + return True + if isinstance(from_dtype, type): + from_dtype = cudf.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = cudf.dtype(to_dtype) + + # TODO : Add precision & scale checking for + # decimal types in future + + if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): + if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index e8b82ff60c2..6c69fbd2637 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,17 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING import cupy as cp import numpy as np from cudf.core.column import as_column -from cudf.core.copy_types import BooleanMask from cudf.core.index import RangeIndex, ensure_index -from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option from cudf.utils.dtypes import can_convert_to_column +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + from cudf.core.index import BaseIndex + def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): """Encode the input values as integer labels @@ -110,55 +115,31 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): return labels, cats.values if return_cupy_array else ensure_index(cats) -def _linear_interpolation(column, index=None): - """ - Interpolate over a float column. Implicitly assumes that values are - evenly spaced with respect to the x-axis, for example the data - [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way - between the two valid values, yielding [1.0, 2.0, 3.0] - """ - - index = RangeIndex(start=0, stop=len(column), step=1) - return _index_or_values_interpolation(column, index=index) - - -def _index_or_values_interpolation(column, index=None): +def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: """ Interpolate over a float column. assumes a linear interpolation strategy using the index of the data to denote spacing of the x values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] - would result in [1.0, 3.0, 4.0] + would result in [1.0, 3.0, 4.0]. 
""" # figure out where the nans are - mask = cp.isnan(column) + mask = column.isnull() # trivial cases, all nan or no nans - num_nan = mask.sum() - if num_nan == 0 or num_nan == len(column): - return column + if not mask.any() or mask.all(): + return column.copy() - to_interp = IndexedFrame(data={None: column}, index=index) - known_x_and_y = to_interp._apply_boolean_mask( - BooleanMask(~mask, len(to_interp)) - ) - - known_x = known_x_and_y.index.to_cupy() - known_y = known_x_and_y._data.columns[0].values + valid_locs = ~mask + if isinstance(index, RangeIndex): + # Each point is evenly spaced, index values don't matter + known_x = cp.flatnonzero(valid_locs.values) + else: + known_x = index._column.apply_boolean_mask(valid_locs).values # type: ignore[attr-defined] + known_y = column.apply_boolean_mask(valid_locs).values result = cp.interp(index.to_cupy(), known_x, known_y) # find the first nan - first_nan_idx = (mask == 0).argmax().item() + first_nan_idx = valid_locs.values.argmax().item() result[:first_nan_idx] = np.nan - return result - - -def get_column_interpolator(method): - interpolator = { - "linear": _linear_interpolation, - "index": _index_or_values_interpolation, - "values": _index_or_values_interpolation, - }.get(method, None) - if not interpolator: - raise ValueError(f"Interpolation method `{method}` not found") - return interpolator + return as_column(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 231af30c06d..9aaccca349d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -47,7 +47,9 @@ ) -_DEFAULT_CATEGORICAL_VALUE = -1 +# Using np.int8(-1) to allow silent wrap-around when casting to uint +# it may make sense to make this dtype specific or a function. +_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) class CategoricalAccessor(ColumnMethods): @@ -1113,24 +1115,18 @@ def is_monotonic_decreasing(self) -> bool: def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self + if isinstance(dtype, pd.CategoricalDtype): + dtype = cudf.CategoricalDtype.from_pandas(dtype) if ( - isinstance( - dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) - ) - and (dtype.categories is None) - and (dtype.ordered is None) + isinstance(dtype, cudf.CategoricalDtype) + and dtype.categories is None + and dtype.ordered is None ): return self - - if isinstance(dtype, pd.CategoricalDtype): - dtype = CategoricalDtype( - categories=dtype.categories, ordered=dtype.ordered - ) - - if not isinstance(dtype, CategoricalDtype): + elif not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._values)): + if not isinstance(self.categories, type(dtype.categories._column)): # If both categories are of different Column types, # return a column full of Nulls. 
return _create_empty_categorical_column(self, dtype) @@ -1142,26 +1138,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column( - self, dtype, format: str | None = None - ) -> StringColumn: - return self._get_decategorized_column().as_string_column( - dtype, format=format - ) + def as_string_column(self) -> StringColumn: + return self._get_decategorized_column().as_string_column() - def as_datetime_column( - self, dtype, format: str | None = None - ) -> DatetimeColumn: - return self._get_decategorized_column().as_datetime_column( - dtype, format - ) + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + return self._get_decategorized_column().as_datetime_column(dtype) - def as_timedelta_column( - self, dtype, format: str | None = None - ) -> TimeDeltaColumn: - return self._get_decategorized_column().as_timedelta_column( - dtype, format - ) + def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: + return self._get_decategorized_column().as_timedelta_column(dtype) def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e7a2863da8c..32e6aade65b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -41,7 +41,6 @@ _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_bool_dtype, is_dtype_equal, is_scalar, is_string_dtype, @@ -72,7 +71,7 @@ get_time_unit, is_column_like, is_mixed_with_object_dtype, - min_scalar_type, + min_signed_type, min_unsigned_type, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -262,7 +261,7 @@ def all(self, skipna: bool = True) -> bool: if self.null_count == self.size: return True - return libcudf.reduce.reduce("all", self, dtype=np.bool_) + return libcudf.reduce.reduce("all", self) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
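The column-level conversions above drop their dtype/format parameters: explicit formatting now flows through the datetime column's strftime (added later in this diff), while astype keeps choosing a default format. A user-level sketch of the split, assuming standard cudf Series APIs.

    import cudf

    s = cudf.Series(cudf.date_range("2024-01-01", periods=3, freq="D"))

    # Explicit format: handled by the strftime path.
    print(s.dt.strftime("%d/%m/%Y"))

    # No format: astype("str") falls back to a default format for the dtype.
    print(s.astype("str"))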
@@ -272,10 +271,13 @@ def any(self, skipna: bool = True) -> bool: elif skipna and self.null_count == self.size: return False - return libcudf.reduce.reduce("any", self, dtype=np.bool_) + return libcudf.reduce.reduce("any", self) def dropna(self) -> Self: - return drop_nulls([self])[0]._with_type_metadata(self.dtype) + if self.has_nulls(): + return drop_nulls([self])[0]._with_type_metadata(self.dtype) + else: + return self.copy() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -619,7 +621,7 @@ def _scatter_by_column( key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": # `key` is boolean mask if len(key) != len(self): raise ValueError( @@ -644,7 +646,7 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return libcudf.copying.boolean_mask_scatter([value], [self], key)[ 0 ]._with_type_metadata(self.dtype) @@ -700,6 +702,9 @@ def fillna( def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(False, length=len(self)) + result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -711,6 +716,9 @@ def isnull(self) -> ColumnBase: def notnull(self) -> ColumnBase: """Identify non-missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(True, length=len(self)) + result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -721,7 +729,7 @@ def notnull(self) -> ColumnBase: return result def indices_of( - self, value: ScalarLike | Self + self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -735,10 +743,10 @@ def indices_of( ------- Column of indices that match value """ - if not isinstance(value, ColumnBase): - value = as_column([value], dtype=self.dtype) + if not is_scalar(value): + raise ValueError("value must be a scalar") else: - assert len(value) == 1 + value = as_column(value, dtype=self.dtype, length=1) mask = libcudf.search.contains(value, self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask @@ -923,15 +931,16 @@ def as_mask(self) -> Buffer: @property def is_unique(self) -> bool: + # distinct_count might already be cached return self.distinct_count(dropna=False) == len(self) - @property + @cached_property def is_monotonic_increasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [True], None ) - @property + @cached_property def is_monotonic_decreasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [False], None @@ -942,6 +951,10 @@ def sort_values( ascending: bool = True, na_position: str = "last", ) -> ColumnBase: + if (not ascending and self.is_monotonic_decreasing) or ( + ascending and self.is_monotonic_increasing + ): + return self.copy() return libcudf.sort.sort( [self], column_order=[ascending], null_precedence=[na_position] )[0] @@ -962,59 +975,59 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if len(self) == 0: dtype = cudf.dtype(dtype) if self.dtype == dtype: - if copy: - return self.copy() - else: - return self + result = self else: - return column_empty(0, dtype=dtype, masked=self.nullable) - if copy: - col = self.copy() - else: - col = self - if dtype == "category": + result = column_empty(0, dtype=dtype, 
masked=self.nullable) + elif dtype == "category": # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string - return col.as_categorical_column(dtype) + result = self.as_categorical_column(dtype) elif ( isinstance(dtype, str) and dtype == "interval" and isinstance(self.dtype, cudf.IntervalDtype) ): # astype("interval") (the string only) should no-op - return col - was_object = dtype == object or dtype == np.dtype(object) - dtype = cudf.dtype(dtype) - if self.dtype == dtype: - return col - elif isinstance(dtype, CategoricalDtype): - return col.as_categorical_column(dtype) - elif isinstance(dtype, IntervalDtype): - return col.as_interval_column(dtype) - elif isinstance(dtype, (ListDtype, StructDtype)): - if not col.dtype == dtype: - raise NotImplementedError( - f"Casting {self.dtype} columns not currently supported" - ) - return col - elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - return col.as_decimal_column(dtype) - elif dtype.kind == "M": - return col.as_datetime_column(dtype) - elif dtype.kind == "m": - return col.as_timedelta_column(dtype) - elif dtype.kind == "O": - if cudf.get_option("mode.pandas_compatible") and was_object: - raise ValueError( - f"Casting to {dtype} is not supported, use " - "`.astype('str')` instead." - ) - return col.as_string_column(dtype) + result = self else: - return col.as_numerical_column(dtype) + was_object = dtype == object or dtype == np.dtype(object) + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + result = self + elif isinstance(dtype, CategoricalDtype): + result = self.as_categorical_column(dtype) + elif isinstance(dtype, IntervalDtype): + result = self.as_interval_column(dtype) + elif isinstance(dtype, (ListDtype, StructDtype)): + if not self.dtype == dtype: + raise NotImplementedError( + f"Casting {self.dtype} columns not currently supported" + ) + result = self + elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): + result = self.as_decimal_column(dtype) + elif dtype.kind == "M": + result = self.as_datetime_column(dtype) + elif dtype.kind == "m": + result = self.as_timedelta_column(dtype) + elif dtype.kind == "O": + if cudf.get_option("mode.pandas_compatible") and was_object: + raise ValueError( + f"Casting to {dtype} is not supported, use " + "`.astype('str')` instead." 
+ ) + result = self.as_string_column() + else: + result = self.as_numerical_column(dtype) + + if copy and result is self: + return result.copy() + return result def as_categorical_column(self, dtype) -> ColumnBase: - if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): + if isinstance(dtype, pd.CategoricalDtype): + dtype = cudf.CategoricalDtype.from_pandas(dtype) + if isinstance(dtype, cudf.CategoricalDtype): ordered = dtype.ordered else: ordered = False @@ -1023,14 +1036,11 @@ def as_categorical_column(self, dtype) -> ColumnBase: if ( isinstance(dtype, cudf.CategoricalDtype) and dtype._categories is not None - ) or ( - isinstance(dtype, pd.CategoricalDtype) - and dtype.categories is not None ): - labels = self._label_encoding(cats=as_column(dtype.categories)) - + cat_col = dtype._categories + labels = self._label_encoding(cats=cat_col) return build_categorical_column( - categories=as_column(dtype.categories), + categories=cat_col, codes=labels, mask=self.mask, ordered=dtype.ordered, @@ -1062,8 +1072,8 @@ def as_numerical_column( raise NotImplementedError def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: raise NotImplementedError def as_interval_column( @@ -1072,13 +1082,11 @@ def as_interval_column( raise NotImplementedError def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: raise NotImplementedError - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: raise NotImplementedError def as_decimal_column( @@ -1088,7 +1096,7 @@ def as_decimal_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) - if not is_bool_dtype(mask.dtype): + if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return apply_boolean_mask([self], mask)[0]._with_type_metadata( @@ -1096,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase: ) def argsort( - self, ascending: bool = True, na_position: str = "last" - ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + self, + ascending: bool = True, + na_position: Literal["first", "last"] = "last", + ) -> cudf.core.column.NumericalColumn: + if (ascending and self.is_monotonic_increasing) or ( + not ascending and self.is_monotonic_decreasing + ): + return as_column(range(len(self))) + elif (ascending and self.is_monotonic_decreasing) or ( + not ascending and self.is_monotonic_increasing + ): + return as_column(range(len(self) - 1, -1, -1)) + else: + return libcudf.sort.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError( @@ -1163,9 +1182,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata( - self.dtype - ) + if self.is_unique: + return self.copy() + else: + return drop_duplicates([self], keep="first")[ + 0 + ]._with_type_metadata(self.dtype) def serialize(self) -> tuple[dict, list]: # data model: @@ -1283,7 +1305,10 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) + dtype = kwargs.pop("dtype", None) + 
return libcudf.reduce.reduce( + op, preprocessed, dtype=dtype, **kwargs + ) return preprocessed def _process_for_reduction( @@ -1314,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: Determine the correct dtype to pass to libcudf based on the input dtype, data dtype, and specific reduction op """ + if reduction_op in {"any", "all"}: + return np.dtype(np.bool_) return self.dtype def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: @@ -1329,7 +1356,7 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: ScalarLike | None = None, + na_sentinel: cudf.Scalar | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1369,7 +1396,7 @@ def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() @@ -1431,9 +1458,10 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def _has_any_nan(arbitrary): +def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaN.""" return any( - ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x)) + isinstance(x, (float, np.floating)) and np.isnan(x) for x in np.asarray(arbitrary) ) @@ -2191,25 +2219,26 @@ def as_column( and arbitrary.null_count > 0 ): arbitrary = arbitrary.cast(pa.float64()) - if cudf.get_option( - "default_integer_bitwidth" - ) and pa.types.is_integer(arbitrary.type): - dtype = _maybe_convert_to_default_type("int") - elif cudf.get_option( - "default_float_bitwidth" - ) and pa.types.is_floating(arbitrary.type): - dtype = _maybe_convert_to_default_type("float") + if ( + cudf.get_option("default_integer_bitwidth") + and pa.types.is_integer(arbitrary.type) + ) or ( + cudf.get_option("default_float_bitwidth") + and pa.types.is_floating(arbitrary.type) + ): + dtype = _maybe_convert_to_default_type( + cudf.dtype(arbitrary.type.to_pandas_dtype()) + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) - if cudf.get_option( - "default_integer_bitwidth" - ) and arbitrary.dtype.kind in set("iu"): - dtype = _maybe_convert_to_default_type("int") - elif ( + if ( + cudf.get_option("default_integer_bitwidth") + and arbitrary.dtype.kind in set("iu") + ) or ( cudf.get_option("default_float_bitwidth") and arbitrary.dtype.kind == "f" ): - dtype = _maybe_convert_to_default_type("float") + dtype = _maybe_convert_to_default_type(arbitrary.dtype) return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) @@ -2285,9 +2314,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and np.issubdtype(dtyp, np.datetime64) - for dtyp in not_null_col_dtypes + _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c10aceba9f4..73902789c11 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,7 
+18,6 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -178,43 +177,6 @@ def _resolve_mixed_dtypes( return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") -def _get_datetime_format(col, dtype, time_unit): - format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S") - if format.endswith("f"): - sub_second_res_len = 3 - else: - sub_second_res_len = 0 - - has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any() - has_micros = ( - time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() - ) - has_millis = ( - time_unit in {"ns", "us", "ms"} - and col.get_dt_field("millisecond").any() - ) - has_seconds = col.get_dt_field("second").any() - has_minutes = col.get_dt_field("minute").any() - has_hours = col.get_dt_field("hour").any() - if sub_second_res_len: - if has_nanos: - # format should be intact and rest of the - # following conditions shouldn't execute. - pass - elif has_micros: - format = format[:-sub_second_res_len] + "%6f" - elif has_millis: - format = format[:-sub_second_res_len] + "%3f" - elif has_seconds or has_minutes or has_hours: - format = format[:-4] - else: - format = format.split(" ")[0] - else: - if not (has_seconds or has_minutes or has_hours): - format = format.split(" ")[0] - return format - - class DatetimeColumn(column.ColumnBase): """ A Column implementation for Date-time types. @@ -381,9 +343,7 @@ def round(self, freq: str) -> ColumnBase: def isocalendar(self) -> dict[str, ColumnBase]: return { - field: self.as_string_column("str", format=directive).astype( - "uint32" - ) + field: self.strftime(format=directive).astype("uint32") for field, directive in zip( ["year", "week", "day"], ["%G", "%V", "%u"] ) @@ -445,17 +405,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return NotImplemented - def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> DatetimeColumn: - dtype = cudf.dtype(dtype) + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( f"cannot astype a datetimelike from {self.dtype} to {dtype}" ) @@ -472,40 +427,69 @@ def as_numerical_column( ) return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%Y-%m-%d %H:%M:%S" + def strftime(self, format: str) -> cudf.core.column.StringColumn: + if len(self) == 0: + return cast( + cudf.core.column.StringColumn, + column.column_empty(0, dtype="object", masked=False), ) - if cudf.get_option("mode.pandas_compatible"): - format = _get_datetime_format( - self, dtype=self.dtype, time_unit=self.time_unit - ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: names = cudf.core.column.column_empty( 0, dtype="object", masked=False ) - if len(self) > 0: - return string._datetime_to_str_typecast_functions[ - cudf.dtype(self.dtype) - 
](self, format, names) - else: - return cast( - "cudf.core.column.StringColumn", - column.column_empty(0, dtype="object", masked=False), - ) + return string._datetime_to_str_typecast_functions[self.dtype]( + self, format, names + ) + + def as_string_column(self) -> cudf.core.column.StringColumn: + format = _dtype_to_format_conversion.get( + self.dtype.name, "%Y-%m-%d %H:%M:%S" + ) + if cudf.get_option("mode.pandas_compatible"): + if format.endswith("f"): + sub_second_res_len = 3 + else: + sub_second_res_len = 0 - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + has_nanos = ( + self.time_unit in {"ns"} + and self.get_dt_field("nanosecond").any() + ) + has_micros = ( + self.time_unit in {"ns", "us"} + and self.get_dt_field("microsecond").any() + ) + has_millis = ( + self.time_unit in {"ns", "us", "ms"} + and self.get_dt_field("millisecond").any() + ) + has_seconds = self.get_dt_field("second").any() + has_minutes = self.get_dt_field("minute").any() + has_hours = self.get_dt_field("hour").any() + if sub_second_res_len: + if has_nanos: + # format should be intact and rest of the + # following conditions shouldn't execute. + pass + elif has_micros: + format = format[:-sub_second_res_len] + "%6f" + elif has_millis: + format = format[:-sub_second_res_len] + "%3f" + elif has_seconds or has_minutes or has_hours: + format = format[:-4] + else: + format = format.split(" ")[0] + elif not (has_seconds or has_minutes or has_hours): + format = format.split(" ")[0] + return self.strftime(format) + + def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: return pd.Timestamp( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count, dtype=dtype), + ).mean(skipna=skipna, min_count=min_count), unit=self.time_unit, ).as_unit(self.time_unit) @@ -513,12 +497,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + skipna=skipna, min_count=min_count, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) @@ -578,10 +561,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # We check this on `other` before reflection since we already know the # dtype of `self`. 
- other_is_timedelta = is_timedelta64_dtype(other.dtype) - other_is_datetime64 = not other_is_timedelta and is_datetime64_dtype( - other.dtype - ) + other_is_timedelta = other.dtype.kind == "m" + other_is_datetime64 = other.dtype.kind == "M" lhs, rhs = (other, self) if reflect else (self, other) out_dtype = None @@ -645,9 +626,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - value = column.as_column( - pd.to_datetime(value), dtype=self.dtype - ).astype("int64") + value = ( + pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") + ) return self.astype("int64").indices_of(value) @property @@ -658,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: - if np.issubdtype(to_dtype, np.datetime64): + if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) @@ -872,10 +853,11 @@ def _local_time(self): offsets_from_utc = offsets.take(indices, nullify=True) return self + offsets_from_utc - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - return self._local_time.as_string_column(dtype, format) + def strftime(self, format: str) -> cudf.core.column.StringColumn: + return self._local_time.strftime(format) + + def as_string_column(self) -> cudf.core.column.StringColumn: + return self._local_time.as_string_column() def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3e238d65cff..6a7f338b065 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -15,7 +15,7 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.api.types import is_integer_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -62,9 +62,7 @@ def as_decimal_column( return self return libcudf.unary.cast(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: return cpp_from_decimal(self) else: @@ -152,7 +150,7 @@ def _validate_fillna_value( def normalize_binop_value(self, other): if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): - if not is_integer_dtype(other.dtype): + if other.dtype.kind not in "iu": raise TypeError( "Decimal columns only support binary operations with " "integer numerical columns." 
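For context: the datetime, timedelta and decimal hunks above (and several hunks below) replace cudf's is_datetime64_dtype / is_timedelta64_dtype / is_bool_dtype / is_integer_dtype helpers with direct NumPy dtype.kind checks. A minimal illustrative sketch of the one-character kind codes being relied on (plain NumPy, not part of this diff):

import numpy as np

# dtype.kind codes assumed by the .kind checks introduced in this PR
assert np.dtype("datetime64[ns]").kind == "M"   # datetime64
assert np.dtype("timedelta64[ns]").kind == "m"  # timedelta64
assert np.dtype("bool").kind == "b"             # boolean
assert np.dtype("int64").kind in "iu"           # signed integer
assert np.dtype("uint8").kind in "iu"           # unsigned integer
assert np.dtype("float32").kind == "f"          # floating point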
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c548db67344..1b7cd95b3d0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -73,10 +73,15 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[ - 0 - ].element_indexing(current_offset) n += child0_size + current_offset_col = current_base_child.base_children[0] + if not len(current_offset_col): + # See https://github.com/rapidsai/cudf/issues/16164 why + # offset column can be uninitialized + break + current_offset = current_offset_col.element_indexing( + current_offset + ) current_base_child = current_base_child.base_children[1] n += ( @@ -248,15 +253,11 @@ def from_sequences( ) return res - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: """ Create a strings column from a list column """ - lc = self._transform_leaves( - lambda col, dtype: col.as_string_column(dtype), dtype - ) + lc = self._transform_leaves(lambda col: col.as_string_column()) # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) @@ -563,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError( "lists_indices and list column is of different " "size." ) - if not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer + if ( + not _is_non_decimal_numeric_dtype( + lists_indices_col.children[1].dtype + ) + or lists_indices_col.children[1].dtype.kind not in "iu" ): raise TypeError( "lists_indices should be column of values of index types." @@ -645,9 +647,17 @@ def sort_values( dtype: list .. pandas-compat:: - **ListMethods.sort_values** + `pandas.Series.list.sort_values` + + This method does not exist in pandas but it can be run + as: - The ``inplace`` and ``kind`` arguments are currently not supported. 
+ >>> import pandas as pd + >>> s = pd.Series([[3, 2, 1], [2, 4, 3]]) + >>> print(s.apply(sorted)) + 0 [1, 2, 3] + 1 [2, 3, 4] + dtype: object """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 76c64e1aea0..f9404eb3b40 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,7 +5,6 @@ import functools from typing import TYPE_CHECKING, Any, Callable, Sequence, cast -import cupy as cp import numpy as np import pandas as pd from typing_extensions import Self @@ -13,14 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf._lib.types import size_type_dtype -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) +from cudf.api.types import is_integer, is_scalar from cudf.core.column import ( ColumnBase, as_column, @@ -32,10 +24,10 @@ from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + find_common_type, min_column_type, min_signed_type, np_dtypes_to_pandas_dtypes, - numeric_normalize_types, ) from .numerical_base import NumericalBaseColumn @@ -131,12 +123,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), - dtype=size_type_dtype, - ) + nan_col = libcudf.unary.is_nan(self) + return nan_col.indices_of(True) else: return super().indices_of(value) @@ -165,7 +153,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + if self.dtype.kind != "b" and device_value.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: device_value = device_value.astype(self.dtype) @@ -232,25 +220,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp = self if reflect else other # Guard against division by zero for integers. 
if ( - (tmp.dtype.type in int_float_dtype_mapping) - and (tmp.dtype.type != np.bool_) - and ( - ( - ( - np.isscalar(tmp) - or ( - isinstance(tmp, cudf.Scalar) - # host to device copy - and tmp.is_valid() - ) - ) - and (0 == tmp) - ) - or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) - ) + tmp.dtype.type in int_float_dtype_mapping + and tmp.dtype.kind != "b" ): - out_dtype = cudf.dtype("float64") - + if isinstance(tmp, NumericalColumn) and 0 in tmp: + out_dtype = cudf.dtype("float64") + elif isinstance(tmp, cudf.Scalar): + if tmp.is_valid() and tmp == 0: + # tmp == 0 can return NA + out_dtype = cudf.dtype("float64") + elif is_scalar(tmp) and tmp == 0: + out_dtype = cudf.dtype("float64") if op in { "__lt__", "__gt__", @@ -264,19 +244,19 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): + if self.dtype.kind == "f" or other.dtype.kind == "f": raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): + if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" if ( op == "__pow__" - and is_integer_dtype(self.dtype) - and (is_integer(other) or is_integer_dtype(other.dtype)) + and self.dtype.kind in "iu" + and (is_integer(other) or other.dtype.kind in "iu") ): op = "INT_POW" @@ -301,15 +281,28 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): if self.dtype == other.dtype: return other + # expensive device-host transfer just to # adjust the dtype other = other.value + + # NumPy 2 needs a Python scalar to do weak promotion, but + # pandas forces weak promotion always + # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies. + if other.dtype.kind in "ifc": + other = other.item() + elif not isinstance(other, (int, float, complex)): + # Go via NumPy to get the value + other = np.array(other) + if other.dtype.kind in "ifc": + other = other.item() + # Try and match pandas and hence numpy. Deduce the common - # dtype via the _value_ of other, and the dtype of self. TODO: - # When NEP50 is accepted, this might want changed or - # simplified. - # This is not at all simple: - # np.result_type(np.int64(0), np.uint8) + # dtype via the _value_ of other, and the dtype of self on NumPy 1.x + # with NumPy 2, we force weak promotion even for our/NumPy scalars + # to match pandas 2.2. 
+ # Weak promotion is not at all simple: + # np.result_type(0, np.uint8) # => np.uint8 # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) # => np.int64 @@ -331,9 +324,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn": return libcudf.string_casting.int2ip(self) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: return string._numeric_to_str_typecast_functions[ cudf.dtype(self.dtype) @@ -345,8 +336,8 @@ def as_string_column( ) def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: return cast( "cudf.core.column.DatetimeColumn", build_column( @@ -359,8 +350,8 @@ def as_datetime_column( ) def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: return cast( "cudf.core.column.TimeDeltaColumn", build_column( @@ -391,7 +382,7 @@ def all(self, skipna: bool = True) -> bool: if result_col.null_count == result_col.size: return True - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("all", result_col) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -402,7 +393,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and result_col.null_count == result_col.size: return False - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("any", result_col) @functools.cached_property def nan_count(self) -> int: @@ -513,11 +504,15 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, self + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) ) + replaced = self.astype(common_type) df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} + { + "old": to_replace_col.astype(common_type), + "new": replacement_col.astype(common_type), + } ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: @@ -628,7 +623,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare - if (self.min() >= min_) and (self.max() <= max_): + # Use Python floats, which have precise comparison for float64. + # NOTE(seberg): it would make sense to limit to the mantissa range. 
+ if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else:
@@ -678,15 +675,16 @@ def to_pandas( return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - col_dtype = self.dtype if reduction_op in {"sum", "product"}:
- col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + if self.dtype.kind == "f": + return self.dtype + return np.dtype("int64") elif reduction_op == "sum_of_squares":
- col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) + return np.result_type(self.dtype, np.dtype("uint64")) + elif reduction_op in {"var", "std", "mean"}: + return np.dtype("float64")
- return col_dtype + return super()._reduction_result_dtype(reduction_op) def _normalize_find_and_replace_input(
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 95c78c5efcb..f41010062c8 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -144,32 +144,27 @@ def mean( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count) def var( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "var", skipna=skipna, min_count=min_count, ddof=ddof ) def std( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "std", skipna=skipna, min_count=min_count, ddof=ddof ) def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 936cd1eccb0..ec95c50f455 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py
@@ -612,7 +612,7 @@ def extract( dtype: object .. pandas-compat:: - **StringMethods.extract** + :meth:`pandas.Series.str.extract` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE.
@@ -738,7 +738,7 @@ def contains( dtype: bool .. pandas-compat:: - **StringMethods.contains** + :meth:`pandas.Series.str.contains` The parameters `case` and `na` are not yet supported and will raise a NotImplementedError if anything other than the default
@@ -974,7 +974,7 @@ def replace( dtype: object .. pandas-compat:: - **StringMethods.replace** + :meth:`pandas.Series.str.replace` The parameters `case` and `flags` are not yet supported and will raise a `NotImplementedError` if anything other than the default
@@ -2803,7 +2803,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) .. pandas-compat:: - **StringMethods.partition** + :meth:`pandas.Series.str.partition` The parameter `expand` is not yet supported and will raise a `NotImplementedError` if anything other than the default
@@ -3527,7 +3527,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Index([0, 0, 2, 1], dtype='int64') .. pandas-compat:: - **StringMethods.count** + :meth:`pandas.Series.str.count` - `flags` parameter currently only supports re.DOTALL and re.MULTILINE.
@@ -3607,7 +3607,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: dtype: list ..
pandas-compat:: - **StringMethods.findall** + :meth:`pandas.Series.str.findall` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3811,7 +3811,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: dtype: bool .. pandas-compat:: - **StringMethods.endswith** + :meth:`pandas.Series.str.endswith` `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. @@ -4264,7 +4264,7 @@ def match( dtype: bool .. pandas-compat:: - **StringMethods.match** + :meth:`pandas.Series.str.match` Parameters `case` and `na` are currently not supported. The `flags` parameter currently only supports re.DOTALL and @@ -5669,16 +5669,25 @@ def as_numerical_column( result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) return result_col - def _as_datetime_or_timedelta_column(self, dtype, format): - if len(self) == 0: - return cudf.core.column.column_empty(0, dtype=dtype) - - # Check for None strings - if (self == "None").any(): - raise ValueError("Could not convert `None` value to datetime") - - is_nat = self == "NaT" - if dtype.kind == "M": + def strptime( + self, dtype: Dtype, format: str + ) -> cudf.core.column.DatetimeColumn | cudf.core.column.TimeDeltaColumn: + if dtype.kind not in "Mm": # type: ignore[union-attr] + raise ValueError( + f"dtype must be datetime or timedelta type, not {dtype}" + ) + elif self.null_count == len(self): + return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] + elif (self == "None").any(): + raise ValueError( + "Cannot convert `None` value to datetime or timedelta." + ) + elif dtype.kind == "M": # type: ignore[union-attr] + if format.endswith("%z"): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + is_nat = self == "NaT" without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) all_same_length = ( libstrings.count_characters(without_nat).distinct_count( @@ -5699,61 +5708,43 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if not valid.all(): raise ValueError(f"Column contains invalid data for {format=}") - casting_func = ( - str_cast.timestamp2int - if dtype.type == np.datetime64 - else str_cast.timedelta2int - ) + casting_func = str_cast.timestamp2int + add_back_nat = is_nat.any() + elif dtype.kind == "m": # type: ignore[union-attr] + casting_func = str_cast.timedelta2int + add_back_nat = False + result_col = casting_func(self, dtype, format) - if is_nat.any(): + if add_back_nat: result_col[is_nat] = None return result_col def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": - out_dtype = cudf.api.types.dtype(dtype) - - # infer on host from the first not na element - # or return all null column if all values - # are null in current column - if format is None: - if self.null_count == len(self): - return cast( - "cudf.core.column.DatetimeColumn", - column.column_empty( - len(self), dtype=out_dtype, masked=True - ), - ) - else: - format = datetime.infer_format( - self.apply_boolean_mask(self.notnull()).element_indexing(0) - ) - - if format.endswith("%z"): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - return self._as_datetime_or_timedelta_column(out_dtype, format) + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: + not_null = self.apply_boolean_mask(self.notnull()) + if len(not_null) == 0: + # We should hit the self.null_count == len(self) condition + # so format doesn't matter + format = "" + else: + # 
infer on host from the first not na element + format = datetime.infer_format(not_null.element_indexing(0)) + return self.strptime(dtype, format) # type: ignore[return-value] def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = cudf.api.types.dtype(dtype) - if format is None: - format = "%D days %H:%M:%S" - return self._as_datetime_or_timedelta_column(out_dtype, format) + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: + return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] def as_decimal_column( self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> StringColumn: + def as_string_column(self) -> StringColumn: return self @property diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 5a0171bbbdc..59ea1cc002c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -153,7 +153,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this: ColumnBinaryOperand = self out_dtype = None - if is_timedelta64_dtype(other.dtype): + if other.dtype.kind == "m": # TODO: pandas will allow these operators to work but return false # when comparing to non-timedelta dtypes. We should do the same. if op in { @@ -263,41 +263,35 @@ def as_numerical_column( ) return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + def as_datetime_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( f"cannot astype a timedelta from {self.dtype} to {dtype}" ) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - if format is None: - format = "%D days %H:%M:%S" - if len(self) > 0: - return string._timedelta_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format=format) - else: + def strftime(self, format: str) -> cudf.core.column.StringColumn: + if len(self) == 0: return cast( - "cudf.core.column.StringColumn", + cudf.core.column.StringColumn, column.column_empty(0, dtype="object", masked=False), ) + else: + return string._timedelta_to_str_typecast_functions[self.dtype]( + self, format=format + ) - def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> TimeDeltaColumn: - dtype = cudf.dtype(dtype) + def as_string_column(self) -> cudf.core.column.StringColumn: + return self.strftime("%D days %H:%M:%S") + + def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, dtype=dtype), + ).mean(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -351,12 +345,11 @@ def std( self, skipna: bool | None = None, 
min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + skipna=skipna, min_count=min_count, ddof=ddof ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index d9f62f51f92..197f46ee9fe 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -188,9 +188,6 @@ def cut( # adjust bin edges decimal precision int_label_bins = np.around(bins, precision) - # the inputs is a column of the values in the array x - input_arr = as_column(x) - # checking for the correct inclusivity values if right: closed = "right" @@ -242,6 +239,9 @@ def cut( labels if len(set(labels)) == len(labels) else None ) + # the inputs is a column of the values in the array x + input_arr = as_column(x) + if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns # we cannot typecast an IntervalIndex, so we need to diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b249410c2e4..7e07078c95b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,8 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, - is_datetime_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -85,8 +83,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, - min_scalar_type, - numeric_normalize_types, + min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api @@ -105,20 +102,6 @@ "var": "nanvar", } -_numeric_reduction_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", -) - def _shape_mismatch_error(x, y): raise ValueError( @@ -172,7 +155,7 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + if as_column(arg[0]).dtype.kind == "b" and not isinstance( arg[1], slice ): return True @@ -321,7 +304,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0].dtype): + if tmp_arg[0].dtype.kind == "b": df = columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -430,7 +413,7 @@ def _setitem_tuple_arg(self, key, value): else: value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: # If the inner dimension is 1, it's broadcastable to # all columns of the dataframe. indexed_shape = columns_df.loc[key[0]].shape @@ -462,6 +445,10 @@ def _setitem_tuple_arg(self, key, value): self._frame[col].loc[key[0]] = value[i] +class _DataFrameAtIndexer(_DataFrameLocIndexer): + pass + + class _DataFrameIlocIndexer(_DataFrameIndexer): """ For selection by index. 
@@ -563,7 +550,7 @@ def _setitem_tuple_arg(self, key, value): # TODO: consolidate code path with identical counterpart # in `_DataFrameLocIndexer._setitem_tuple_arg` value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: indexed_shape = columns_df.iloc[key[0]].shape if value.shape[1] == 1: if value.shape[0] != indexed_shape[0]: @@ -584,6 +571,10 @@ def _setitem_tuple_arg(self, key, value): self._frame[col].iloc[key[0]] = value[i] +class _DataFrameiAtIndexer(_DataFrameIlocIndexer): + pass + + class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -603,6 +594,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + Currently not implemented. nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -689,8 +683,11 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): + if copy is not None: + raise NotImplementedError("copy is not currently implemented.") super().__init__() if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") @@ -917,7 +914,8 @@ def _init_from_series_list(self, data, columns, index): final_index = ensure_index(index) series_lengths = list(map(len, data)) - data = numeric_normalize_types(*data) + common_dtype = find_common_type([obj.dtype for obj in data]) + data = [obj.astype(common_dtype) for obj in data] if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by # getting union of all `index` of the Series objects. @@ -1532,6 +1530,25 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the cudf DataFrame as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. Currently not implemented. + + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2192,8 +2209,8 @@ def from_dict( orient = orient.lower() if orient == "index": - if len(data) > 0 and isinstance( - next(iter(data.values())), (cudf.Series, cupy.ndarray) + if isinstance( + next(iter(data.values()), None), (cudf.Series, cupy.ndarray) ): result = cls(data).T result.columns = ( @@ -2243,6 +2260,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2276,6 +2294,13 @@ def to_dict( instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). 
+ Returns ------- dict, list or collections.abc.Mapping @@ -2357,7 +2382,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -2581,14 +2606,14 @@ def iat(self): """ Alias for ``DataFrame.iloc``; provided for compatibility with Pandas. """ - return self.iloc + return _DataFrameiAtIndexer(self) @property def at(self): """ Alias for ``DataFrame.loc``; provided for compatibility with Pandas. """ - return self.loc + return _DataFrameAtIndexer(self) @property # type: ignore @_external_only_api( @@ -2744,7 +2769,7 @@ def reindex( Chrome 200 0.02 .. pandas-compat:: - **DataFrame.reindex** + :meth:`pandas.DataFrame.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -3012,7 +3037,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3344,7 +3374,7 @@ def diff(self, periods=1, axis=0): 5 2 5 20 .. pandas-compat:: - **DataFrame.diff** + :meth:`pandas.DataFrame.diff` Diff currently only supports numeric dtype columns. """ @@ -3549,7 +3579,7 @@ def rename( 30 3 6 .. pandas-compat:: - **DataFrame.rename** + :meth:`pandas.DataFrame.rename` * Not Supporting: level @@ -3585,15 +3615,15 @@ def rename( if level is not None and isinstance(self.index, MultiIndex): level = self.index._get_level_label(level) - out_index = self.index.copy(deep=copy) - level_values = out_index.get_level_values(level) - level_values.to_frame().replace( + level_values = self.index.get_level_values(level) + ca = self.index._data.copy(deep=copy) + ca[level] = level_values._column.find_and_replace( to_replace=list(index.keys()), - value=list(index.values()), - inplace=True, + replacement=list(index.values()), + ) + out_index = type(self.index)._from_data( + ca, name=self.index.name ) - out_index._data[level] = column.as_column(level_values) - out_index._compute_levels_and_codes() else: to_replace = list(index.keys()) vals = list(index.values()) @@ -3622,7 +3652,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -3664,15 +3696,15 @@ def agg(self, aggs, axis=None): ``DataFrame`` is returned. .. pandas-compat:: - **DataFrame.agg** + :meth:`pandas.DataFrame.agg` * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - if not is_bool_dtype(common_dtype) and any( - is_bool_dtype(dtype) for dtype in dtypes + if common_dtype.kind != "b" and any( + dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") @@ -3837,7 +3869,7 @@ def nlargest(self, n, columns, keep="first"): Brunei 434000 12128 BN .. 
pandas-compat:: - **DataFrame.nlargest** + :meth:`pandas.DataFrame.nlargest` - Only a single column is supported in *columns* """ @@ -3909,7 +3941,7 @@ def nsmallest(self, n, columns, keep="first"): Nauru 337000 182 NR .. pandas-compat:: - **DataFrame.nsmallest** + :meth:`pandas.DataFrame.nsmallest` - Only a single column is supported in *columns* """ @@ -3991,7 +4023,7 @@ def transpose(self): a new (ncol x nrow) dataframe. self is (nrow x ncol) .. pandas-compat:: - **DataFrame.transpose, DataFrame.T** + :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T` Not supporting *copy* because default and only behavior is copy=True @@ -4182,7 +4214,7 @@ def merge( from both sides. .. pandas-compat:: - **DataFrame.merge** + :meth:`pandas.DataFrame.merge` DataFrames merges in cuDF result in non-deterministic row ordering. @@ -4238,6 +4270,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4251,19 +4284,33 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. Returns ------- joined : DataFrame .. pandas-compat:: - **DataFrame.join** + :meth:`pandas.DataFrame.join` - *other* must be a single DataFrame for now. - *on* is not supported yet due to lack of multi-index support. """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4300,7 +4347,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -4311,7 +4357,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) @@ -4379,7 +4424,7 @@ def query(self, expr, local_dict=None): 1 2018-10-08 .. pandas-compat:: - **DataFrame.query** + :meth:`pandas.DataFrame.query` One difference from pandas is that ``query`` currently only supports numeric, datetime, timedelta, or bool dtypes. @@ -4414,7 +4459,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4442,6 +4496,25 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). 
+ If False, the funcs will be passed the whole Series at once. + + Currently not supported. + + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keyword arguments to + `func`. Examples --------
@@ -4592,13 +4665,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs)
@@ -5441,10 +5518,11 @@ def from_arrow(cls, table): 2 3 6 .. pandas-compat:: - **DataFrame.from_arrow** + `pandas.DataFrame.from_arrow` - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. + This method does not exist in pandas but it is similar to + how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables i.e. + it does not support automatically setting index column(s). """ index_col = None col_index_names = None
@@ -5498,7 +5576,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table.
@@ -5588,18 +5666,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) + elif index_dtypes is not None: + raise NotImplementedError( + "index_dtypes is currently not supported." + ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members)
@@ -5612,7 +5708,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame.
@@ -5622,13 +5727,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): index : str, array-like The name of the index column in *data*. If None, the default index is used.
+ exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """
+ if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." + ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" )
@@ -5691,7 +5815,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_performance_tracking - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): + def _from_arrays( + cls, + data: np.ndarray | cupy.ndarray, + index=None, + columns=None, + nan_as_null=False, + ): """Convert a numpy/cupy array to DataFrame. Parameters
@@ -5709,8 +5839,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ------- DataFrame """ - - data = cupy.asarray(data) if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}"
@@ -5874,7 +6002,7 @@ def quantile( 0.5 2.5 55.0 .. pandas-compat:: - **DataFrame.quantile** + :meth:`pandas.DataFrame.quantile` One notable difference from Pandas is when DataFrame is of non-numeric types and result is expected to be a Series in case of
@@ -6105,7 +6233,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): else: filtered = self.copy(deep=False) - is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) + is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes) common_dtype = find_common_type(filtered.dtypes) if (
@@ -6164,7 +6292,7 @@ def count(self, axis=0, numeric_only=False): dtype: int64 .. pandas-compat:: - **DataFrame.count** + :meth:`pandas.DataFrame.count` Parameters currently not supported are `axis` and `numeric_only`. """
@@ -6294,8 +6422,8 @@ def _reduce( and any( not is_object_dtype(dtype) for dtype in source_dtypes ) - or not is_bool_dtype(common_dtype) - and any(is_bool_dtype(dtype) for dtype in source_dtypes) + or common_dtype.kind != "b" + and any(dtype.kind == "b" for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to "
@@ -6402,7 +6530,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): 1 2.0 .. pandas-compat:: - **DataFrame.mode** + :meth:`pandas.DataFrame.mode` ``axis`` parameter is currently not supported.
""" @@ -6502,7 +6630,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): cudf.utils.dtypes.get_min_float_dtype( prepared._data[col] ) - if not is_datetime_dtype(common_dtype) + if common_dtype.kind != "M" else cudf.dtype("float64") ) .fillna(np.nan)
@@ -6529,7 +6657,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_dtype = ( common_dtype if method in type_coerced_methods - or is_datetime_dtype(common_dtype) + or (common_dtype is not None and common_dtype.kind == "M") else None ) result = column.as_column(result, dtype=result_dtype)
@@ -7050,12 +7178,8 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Assemble the final index new_index_columns = [*repeated_index._columns, *tiled_index] index_names = [*self.index.names, *unique_named_levels.names] - new_index = MultiIndex.from_frame( - DataFrame._from_data( - dict(zip(range(0, len(new_index_columns)), new_index_columns)) - ), - names=index_names, - ) + new_index = MultiIndex._from_data(dict(enumerate(new_index_columns))) + new_index.names = index_names # Compute the column indices that serves as the input for # `interleave_columns`
@@ -7353,9 +7477,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking
@@ -7401,7 +7525,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements
@@ -7426,6 +7555,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`. Returns -------
@@ -7471,7 +7603,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__(
@@ -7588,7 +7720,7 @@ def interleave_columns(self): The interleaved columns as a single column .. pandas-compat:: - **DataFrame.interleave_columns** + `pandas.DataFrame.interleave_columns` This method does not exist in pandas but it can be run as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``.
@@ -7690,7 +7822,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 4 5 2 7 3 .. pandas-compat:: - **DataFrame.eval** + :meth:`pandas.DataFrame.eval` * Additional kwargs are not supported. * Bitwise and logical operators are not dtype-dependent.
@@ -7841,7 +7973,26 @@ def value_counts( return result -def from_dataframe(df, allow_copy=False): +def from_dataframe(df, allow_copy: bool = False) -> DataFrame: + """ + Build a :class:`DataFrame` from an object supporting the dataframe interchange protocol. + + .. note:: + + If you have a ``pandas.DataFrame``, use :func:`from_pandas` instead. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. ``__dataframe__`` method. + allow_copy : bool, default: False + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested).
+ + Returns + ------- + :class:`DataFrame` + """ return df_protocol.from_dataframe(df, allow_copy=allow_copy) @@ -8280,7 +8431,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): )._column.unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) + dtypes[idx] = min_signed_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 9cd573aceb9..a70a42c04af 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -648,7 +648,7 @@ def __dataframe__( def from_dataframe( df: DataFrameObject, allow_copy: bool = False -) -> _CuDFDataFrame: +) -> cudf.DataFrame: """ Construct a ``DataFrame`` from ``df`` if it supports the dataframe interchange protocol (``__dataframe__``). diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 253d200f7d4..c82e073d7b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " @@ -591,7 +591,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where** + :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where` Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1189,7 +1189,7 @@ def searchsorted( side: Literal["left", "right"] = "left", ascending: bool = True, na_position: Literal["first", "last"] = "last", - ): + ) -> ScalarLike | cupy.ndarray: """Find indices where elements should be inserted to maintain order Parameters @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs - ): + ) -> list[dict[Any, ColumnBase]]: # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands( # without cupy. mask = None - data = [{} for _ in range(ufunc.nout)] + data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] for name, (left, right, _, _) in operands.items(): cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): @@ -1587,6 +1587,12 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + # Reductions @classmethod @_performance_tracking @@ -1641,7 +1647,7 @@ def min( 1 .. pandas-compat:: - **DataFrame.min, Series.min** + :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1689,7 +1695,7 @@ def max( dtype: int64 .. 
pandas-compat:: - **DataFrame.max, Series.max** + :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1742,7 +1748,7 @@ def all(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.all, Series.all** + :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all` Parameters currently not supported are `axis`, `bool_only`, `level`. @@ -1795,7 +1801,7 @@ def any(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any** + :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any` Parameters currently not supported are `axis`, `bool_only`, `level`. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index eccb3acabf6..3f91be71f29 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype +from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -35,7 +35,12 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + from cudf._typing import ( + AggType, + DataFrameOrSeries, + MultiColumnAggType, + ScalarLike, + ) def _deprecate_collect(): @@ -357,7 +362,7 @@ def groups(self): ) @cached_property - def indices(self): + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. @@ -739,7 +744,8 @@ def _reduce( Computed {op} of values within each group. .. pandas-compat:: - **{cls}.{op}** + :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`, + :meth:`pandas.core.groupby.SeriesGroupBy.{op}` The numeric_only, min_count """ @@ -1015,18 +1021,16 @@ def ngroup(self, ascending=True): if ascending: # Count ascending from 0 to num_groups - 1 - group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + groups = range(num_groups) elif has_null_group: # Count descending from num_groups - 1 to 0, but subtract one more # for the null group making it num_groups - 2 to -1. - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 2, -2, -1)} - ) + groups = range(num_groups - 2, -2, -1) else: # Count descending from num_groups - 1 to 0 - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 1, -1, -1)} - ) + groups = range(num_groups - 1, -1, -1) + + group_ids = cudf.Series._from_data({None: as_column(groups)}) if has_null_group: group_ids.iloc[-1] = cudf.NA @@ -1479,7 +1483,8 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **GroupBy.apply** + :meth:`pandas.core.groupby.DataFrameGroupBy.apply`, + :meth:`pandas.core.groupby.SeriesGroupBy.apply` cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -1531,7 +1536,7 @@ def mult(df): # For `sum` & `product`, boolean types # will need to result in `int64` type. 
for name, col in res._data.items(): - if is_bool_dtype(col.dtype): + if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -1713,7 +1718,7 @@ def rolling_avg(val, avg): return grouped_values.apply_chunks(function, **kwargs) @_performance_tracking - def _broadcast(self, values): + def _broadcast(self, values: cudf.Series) -> cudf.Series: """ Broadcast the results of an aggregation to the group @@ -2355,7 +2360,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Object shifted within each group. .. pandas-compat:: - **GroupBy.shift** + :meth:`pandas.core.groupby.DataFrameGroupBy.shift`, + :meth:`pandas.core.groupby.SeriesGroupBy.shift` Parameter ``freq`` is unsupported. """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..ae20fcd5d9c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,11 +52,9 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, - numeric_normalize_types, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range @@ -103,7 +101,7 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( idx: Index | cudf.MultiIndex, - key_as_table: Frame, + keys: list[ColumnBase], is_sorted: bool, ) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index @@ -118,13 +116,13 @@ def _lexsorted_equal_range( sort_vals = idx lower_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="right", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) @@ -260,7 +258,9 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, Self]: if sort and self.step < 0: codes = cupy.arange(len(self) - 1, -1, -1) uniques = self[::-1] @@ -355,12 +355,10 @@ def _data(self): @_performance_tracking def __contains__(self, item): hash(item) - if isinstance(item, bool) or not isinstance( - item, - tuple( - _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] - ), - ): + if not isinstance(item, (np.floating, np.integer, int, float)): + return False + elif isinstance(item, (np.timedelta64, np.datetime64, bool)): + # Cases that would pass the above check return False try: int_item = int(item) @@ -753,15 +751,16 @@ def difference(self, other, sort=None): super().difference(other, sort=sort) ) - def _try_reconstruct_range_index(self, index): - if isinstance(index, RangeIndex) or index.dtype.kind == "f": + def _try_reconstruct_range_index( + self, index: BaseIndex + ) -> Self | BaseIndex: + if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": return index # Evenly spaced values can return a # RangeIndex instead of a materialized Index. 
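The comment above is the heart of `_try_reconstruct_range_index`: an integer index whose consecutive differences are all equal (and non-zero) can be represented as a `RangeIndex` instead of materialized values. A rough NumPy-only sketch of that check, with illustrative values:

    import numpy as np

    values = np.array([10, 13, 16, 19])        # evenly spaced, step 3
    uniques = np.unique(np.diff(values))
    if len(uniques) == 1 and (diff := uniques[0]) != 0:
        reconstructed = range(values[0], values[-1] + diff, diff)
        # range(10, 22, 3) reproduces 10, 13, 16, 19 without storing them

The walrus expression mirrors the rewritten condition, which reads the single unique difference once instead of fetching it twice.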
- if not index._column.has_nulls(): + if not index._column.has_nulls(): # type: ignore[attr-defined] uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and uniques[0].get() != 0: - diff = uniques[0].get() + if len(uniques) == 1 and (diff := uniques[0].get()) != 0: new_range = range(index[0], index[-1] + diff, diff) return type(self)(new_range, name=index.name) return index @@ -1309,7 +1308,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) @_performance_tracking - def get_loc(self, key): + def get_loc(self, key) -> int | slice | cupy.ndarray: if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1317,9 +1316,8 @@ def get_loc(self, key): self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + self, [as_column([key])], is_sorted ) if lower_bound == upper_bound: @@ -1330,7 +1328,7 @@ def get_loc(self, key): return ( lower_bound if is_sorted - else sort_inds.element_indexing(lower_bound) + else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] ) if is_sorted: @@ -1339,8 +1337,8 @@ def get_loc(self, key): return slice(lower_bound, upper_bound) # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask = cupy.full(len(self), False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] mask[true_inds] = True return mask @@ -1458,18 +1456,19 @@ def notna(self): notnull = notna def _is_numeric(self): - return isinstance( - self._values, cudf.core.column.NumericalColumn - ) and self.dtype != cudf.dtype("bool") + return ( + isinstance(self._values, cudf.core.column.NumericalColumn) + and self.dtype.kind != "b" + ) def _is_boolean(self): - return self.dtype == cudf.dtype("bool") + return self.dtype.kind == "b" def _is_integer(self): - return cudf.api.types.is_integer_dtype(self.dtype) + return self.dtype.kind in "iu" def _is_floating(self): - return cudf.api.types.is_float_dtype(self.dtype) + return self.dtype.kind == "f" def _is_object(self): return isinstance(self._values, cudf.core.column.StringColumn) @@ -1599,9 +1598,13 @@ def append(self, other): f"either one of them to same dtypes." ) - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) + if ( + isinstance(self._column, cudf.core.column.NumericalColumn) + and self.dtype != other.dtype + ): + common_type = find_common_type((self.dtype, other.dtype)) + this = this.astype(common_type) + other = other.astype(common_type) to_concat = [this, other] return self._concat(to_concat) @@ -2076,7 +2079,7 @@ def day_of_year(self): @property # type: ignore @_performance_tracking - def is_leap_year(self): + def is_leap_year(self) -> cupy.ndarray: """ Boolean indicator if the date belongs to a leap year. 
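Several helpers above (`_is_numeric`, `_is_boolean`, `_is_integer`, `_is_floating`, and the replaced `is_bool_dtype`/`is_integer_dtype`/`is_float_dtype` call sites) now branch on `dtype.kind` directly. The single-character kind codes are the standard NumPy ones, e.g.:

    import numpy as np

    np.dtype("bool").kind             # "b"
    np.dtype("int32").kind            # "i"  (unsigned integers are "u", hence the `in "iu"` checks)
    np.dtype("float64").kind          # "f"
    np.dtype("datetime64[ns]").kind   # "M"
    np.dtype("timedelta64[ns]").kind  # "m"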
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ff10051c52d..60cd142db4b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -26,6 +26,8 @@ import cudf import cudf._lib as libcudf +import cudf.core +import cudf.core.algorithms from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -495,7 +497,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty** + :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty` If DataFrame/Series contains only `null` values, it is still not considered empty. See the example above. @@ -829,7 +831,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace** + :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace` Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -1370,7 +1372,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum** + :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1431,7 +1433,7 @@ def product( dtype: int64 .. pandas-compat:: - **DataFrame.product, Series.product** + :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product` Parameters currently not supported are level`, `numeric_only`. """ @@ -1528,7 +1530,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median** + :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median` Parameters currently not supported are `level` and `numeric_only`. """ @@ -1584,7 +1586,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std** + :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std` Parameters currently not supported are `level` and `numeric_only` @@ -1643,7 +1645,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var** + :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var` Parameters currently not supported are `level` and `numeric_only` @@ -1699,7 +1701,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis** + :meth:`pandas.DataFrame.kurtosis` Parameters currently not supported are `level` and `numeric_only` """ @@ -1761,7 +1763,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** + :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew` The `axis` parameter is not currently supported. 
""" @@ -1987,6 +1989,8 @@ def interpolate( "Use obj.ffill() or obj.bfill() instead.", FutureWarning, ) + elif method not in {"linear", "values", "index"}: + raise ValueError(f"Interpolation method `{method}` not found") data = self @@ -2000,7 +2004,10 @@ def interpolate( ) ) - interpolator = cudf.core.algorithms.get_column_interpolator(method) + if method == "linear": + interp_index = RangeIndex(self._num_rows) + else: + interp_index = data.index columns = [] for col in data._columns: if isinstance(col, cudf.core.column.StringColumn): @@ -2012,8 +2019,9 @@ def interpolate( if col.nullable: col = col.astype("float64").fillna(np.nan) - # Interpolation methods may or may not need the index - columns.append(interpolator(col, index=data.index)) + columns.append( + cudf.core.algorithms._interpolation(col, index=interp_index) + ) result = self._from_data_like_self( self._data._from_columns_like_self(columns) @@ -2221,7 +2229,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate** + :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate` The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -2657,7 +2665,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index** + :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index` * Not supporting: kind, sort_remaining=False """ @@ -3294,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3456,6 +3464,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3471,6 +3480,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -3489,7 +3506,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values** + :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values` * Support axis='index' only. * Not supporting: inplace, kind @@ -3510,6 +3527,8 @@ def sort_values( ) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") + if key is not None: + raise NotImplementedError("key is not currently supported.") if len(self) == 0: return self @@ -4000,7 +4019,7 @@ def resample( .. pandas-compat:: - **DataFrame.resample, Series.resample** + :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` Note that the dtype of the index (or the 'on' column if using 'on=') in the result will be of a frequency closest to the @@ -4556,7 +4575,7 @@ def sample( 1 2 4 .. 
pandas-compat:: - **DataFrame.sample, Series.sample** + :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample` When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) @@ -5241,7 +5260,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -5251,11 +5269,6 @@ def groupby( if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") - if squeeze is not False: - raise NotImplementedError( - "squeeze parameter is not yet implemented" - ) - if not observed: raise NotImplementedError( "observed parameter is not yet implemented" @@ -6227,13 +6240,13 @@ def rank( def convert_dtypes( self, - infer_objects=True, - convert_string=True, - convert_integer=True, - convert_boolean=True, - convert_floating=True, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, dtype_backend=None, - ): + ) -> Self: """ Convert columns to the best possible nullable dtypes. @@ -6244,17 +6257,21 @@ def convert_dtypes( All other dtypes are always returned as-is as all dtypes in cudf are nullable. """ - result = self.copy() - - if convert_floating: - # cast any floating columns to int64 if - # they are all integer data: - for name, col in result._data.items(): + if not (convert_floating and convert_integer): + return self.copy() + else: + cols = [] + for col in self._columns: if col.dtype.kind == "f": col = col.fillna(0) - if cp.allclose(col, col.astype("int64")): - result._data[name] = col.astype("int64") - return result + as_int = col.astype("int64") + if cp.allclose(col, as_int): + cols.append(as_int) + continue + cols.append(col) + return self._from_data_like_self( + self._data._from_columns_like_self(cols, verify=False) + ) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..a0089242909 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -8,12 +8,7 @@ from typing_extensions import TypeAlias import cudf -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_bool_dtype, - is_integer, - is_integer_dtype, -) +from cudf.api.types import _is_scalar_or_zero_d_array, is_integer from cudf.core.copy_types import BooleanMask, GatherMap @@ -230,11 +225,11 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): key = key.astype(key.codes.dtype) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() - elif is_integer_dtype(key.dtype): + elif key.dtype.kind in "iu": return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index dd0a4f666a1..32c84763401 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,7 +9,7 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal +from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype @@ -88,38 +88,25 @@ def _match_join_keys( ) if ( - 
np.issubdtype(ltype, np.number) - and np.issubdtype(rtype, np.number) - and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) - ) + is_numeric_dtype(ltype) + and is_numeric_dtype(rtype) + and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) - ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + elif (ltype.kind == "M" and rtype.kind == "M") or ( + ltype.kind == "m" and rtype.kind == "m" ): common_type = max(ltype, rtype) - elif ( - np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) - ) and not rcol.fillna(0).can_cast_safely(ltype): + elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) - ) and not lcol.fillna(0).can_cast_safely(rtype): + elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9cbe863142b..ff4b06c6334 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,9 +7,7 @@ import operator import pickle import warnings -from collections import abc from functools import cached_property -from numbers import Integral from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp @@ -20,9 +18,10 @@ import cudf._lib as libcudf from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype +from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core.algorithms import factorize from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( @@ -63,6 +62,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: return indices +def _compute_levels_and_codes( + data: MutableMapping, +) -> tuple[list[cudf.Index], list[column.ColumnBase]]: + """Return MultiIndex level and codes from a ColumnAccessor-like mapping.""" + levels = [] + codes = [] + for col in data.values(): + code, cats = factorize(col) + codes.append(column.as_column(code.astype(np.int64))) + levels.append(cats) + + return levels, codes + + class MultiIndex(Frame, BaseIndex, NotIterable): """A multi-level or hierarchical index. 
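The `_match_join_keys` rewrite above keeps the existing dtype-resolution rule while expressing it through `dtype.kind`: for two numeric, non-timedelta keys the wider dtype wins when the kinds match, and NumPy promotion decides otherwise; datetime/datetime and timedelta/timedelta pairs take the wider of the two. A NumPy-only illustration of the numeric branch (the helper name here is just for illustration, not the cudf code path itself):

    import numpy as np

    def common_join_dtype(ltype, rtype):
        return max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype)

    common_join_dtype(np.dtype("int32"), np.dtype("int64"))    # int64: same kind, larger width wins
    common_join_dtype(np.dtype("int64"), np.dtype("float32"))  # float64: mixed kinds fall back to promotion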
@@ -145,50 +158,36 @@ def __init__( raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (abc.Sequence, np.ndarray, cp.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") - - if copy: - if isinstance(codes, cudf.DataFrame): - codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance( - levels[0], (cudf.Index, cudf.Series) - ): - levels = [level.copy(deep=True) for level in levels] - - if not isinstance(codes, cudf.DataFrame): - if len(levels) == len(codes): - codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - - levels = [ensure_index(level) for level in levels] - - if len(levels) != len(codes._data): + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + elif not (is_list_like(levels) and len(levels) > 0): + raise ValueError("Must pass non-zero length sequence of levels") + elif not (is_list_like(codes) and len(codes) > 0): + raise ValueError("Must pass non-zero length sequence of codes") + elif len(codes) != len(levels): raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - if len({c.size for c in codes._data.columns}) != 1: - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" + f"levels must have the same length ({len(levels)}) " + f"as codes ({len(codes)})." ) + new_levels = [] + for level in levels: + new_level = ensure_index(level) + if copy and new_level is level: + new_level = new_level.copy(deep=True) + new_levels.append(new_level) + + new_codes = [] + for code in codes: + if not (is_list_like(code) or is_column_like(code)): + raise TypeError("Each code must be list-like") + new_code = column.as_column(code).astype("int64") + if copy and new_code is code: + new_code = new_code.copy(deep=True) + new_codes.append(new_code) + source_data = {} - for (column_name, code), level in zip(codes._data.items(), levels): + for i, (code, level) in enumerate(zip(new_codes, new_levels)): if len(code): lo, hi = libcudf.reduce.minmax(code) if lo.value < -1 or hi.value > len(level) - 1: @@ -201,13 +200,11 @@ def __init__( result_col = libcudf.copying.gather( [level._column], code, nullify=True ) - source_data[column_name] = result_col[0]._with_type_metadata( - level.dtype - ) + source_data[i] = result_col[0]._with_type_metadata(level.dtype) - super().__init__(source_data) - self._levels = levels - self._codes = codes + super().__init__(ColumnAccessor(source_data)) + self._levels = new_levels + self._codes = new_codes self._name = None self.names = names @@ -349,10 +346,37 @@ def _from_data( data: MutableMapping, name: Any = None, ) -> MultiIndex: - obj = cls.from_frame(cudf.DataFrame._from_data(data=data)) - if name is not None: - obj.name = name - return obj + """ + Use when you have a ColumnAccessor-like mapping but no codes and levels. 
+ """ + levels, codes = _compute_levels_and_codes(data) + return cls._simple_new( + data=ColumnAccessor(data), + levels=levels, + codes=codes, + names=pd.core.indexes.frozen.FrozenList(data.keys()), + name=name, + ) + + @classmethod + def _simple_new( + cls, + data: ColumnAccessor, + levels: list[cudf.Index], + codes: list[column.ColumnBase], + names: pd.core.indexes.frozen.FrozenList, + name: Any = None, + ) -> Self: + """ + Use when you have a ColumnAccessor-like mapping, codes, and levels. + """ + mi = object.__new__(cls) + mi._data = data + mi._levels = levels + mi._codes = codes + mi._names = names + mi._name = name + return mi @property # type: ignore @_performance_tracking @@ -420,18 +444,17 @@ def copy( 2020-08-28 AMZN 3401.80 MSFT 228.91 """ - - mi = MultiIndex._from_data(self._data.copy(deep=deep)) - if self._levels is not None: - mi._levels = [idx.copy(deep=deep) for idx in self._levels] - if self._codes is not None: - mi._codes = self._codes.copy(deep) if names is not None: - mi.names = names - elif self.names is not None: - mi.names = self.names.copy() - - return mi + names = pd.core.indexes.frozen.FrozenList(names) + else: + names = self.names + return type(self)._simple_new( + data=self._data.copy(deep=deep), + levels=[idx.copy(deep=deep) for idx in self._levels], + codes=[code.copy(deep=deep) for code in self._codes], + names=names, + name=name, + ) @_performance_tracking def __repr__(self): @@ -477,14 +500,8 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @property - def _codes_frame(self): - if self._codes is None: - self._compute_levels_and_codes() - return self._codes - @property # type: ignore - @_external_only_api("Use ._codes_frame instead") + @_external_only_api("Use ._codes instead") @_performance_tracking def codes(self): """ @@ -504,7 +521,7 @@ def codes(self): FrozenList([[0, 1, 2], [0, 1, 2]]) """ return pd.core.indexes.frozen.FrozenList( - col.values for col in self._codes_frame._columns + col.values for col in self._codes ) def get_slice_bound(self, label, side, kind=None): @@ -518,13 +535,13 @@ def nlevels(self): @property # type: ignore @_performance_tracking - def levels(self): + def levels(self) -> list[cudf.Index]: """ Returns list of levels in the MultiIndex Returns ------- - List of Series objects + List of Index objects Examples -------- @@ -544,9 +561,9 @@ def levels(self): >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] """ # noqa: E501 - if self._levels is None: - self._compute_levels_and_codes() - return self._levels + return [ + idx.rename(name) for idx, name in zip(self._levels, self.names) + ] @property # type: ignore @_performance_tracking @@ -565,11 +582,10 @@ def _get_level_label(self, level): else if level is index of the level, then level label will be returned as per the index. 
""" - - if level in self._data.names: + if level in self.names: return level else: - return self._data.names[level] + return self.names[level] @_performance_tracking def isin(self, values, level=None): @@ -670,20 +686,6 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @_performance_tracking - def _compute_levels_and_codes(self): - levels = [] - - codes = {} - for name, col in self._data.items(): - code, cats = cudf.Series._from_data({None: col}).factorize() - cats.name = name - codes[name] = code.astype(np.int64) - levels.append(cats) - - self._levels = levels - self._codes = cudf.DataFrame._from_data(codes) - @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" @@ -822,7 +824,7 @@ def _index_and_downcast(self, result, index, index_key): result.names = index.names[size:] index = MultiIndex( levels=index.levels[size:], - codes=index._codes_frame.iloc[:, size:], + codes=index._codes[size:], names=index.names[size:], ) @@ -839,10 +841,6 @@ def _get_row_major( | tuple[Any, ...] | list[tuple[Any, ...]], ) -> DataFrameOrSeries: - if pd.api.types.is_bool_dtype( - list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple - ): - return df[row_tuple] if isinstance(row_tuple, slice): if row_tuple.start is None: row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) @@ -932,28 +930,29 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, abc.Sequence)): - index = np.array(index) - elif isinstance(index, slice): + if isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.as_column(range(start, stop, step)) - result = MultiIndex.from_frame( - self.to_frame(index=False, name=range(0, self.nlevels)).take( - index - ), - names=self.names, + idx = range(start, stop, step) + elif is_scalar(index): + idx = [index] + else: + idx = index + + indexer = column.as_column(idx) + ca = self._data._from_columns_like_self( + (col.take(indexer) for col in self._columns), verify=False + ) + codes = [code.take(indexer) for code in self._codes] + result = type(self)._simple_new( + data=ca, codes=codes, levels=self._levels, names=self.names ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: if flatten: return result.to_pandas()[0] - - if self._codes_frame is not None: - result._codes = self._codes_frame.take(index) - if self._levels is not None: - result._levels = self._levels - return result + else: + return result @_performance_tracking def to_frame(self, index=True, name=no_default, allow_duplicates=False): @@ -1269,25 +1268,12 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ('NJ', 'Precip')], names=['state', 'observation']) """ - obj = cls.__new__(cls) - super(cls, obj).__init__() - - source_data = df.copy(deep=False) - source_data.reset_index(drop=True, inplace=True) - if isinstance(source_data, pd.DataFrame): - source_data = cudf.DataFrame.from_pandas(source_data) - - names = names if names is not None else source_data._data.names - # if names are unique - # try using those as the source_data column names: - if len(dict.fromkeys(names)) == len(names): - source_data.columns = names - obj._name = None - obj._data = source_data._data - obj.names = names - obj._codes = None - obj._levels = None - return obj + if isinstance(df, pd.DataFrame): + source_data = 
cudf.DataFrame.from_pandas(df) + else: + source_data = df + names = names if names is not None else source_data._column_names + return cls.from_arrays(source_data._columns, names=names) @classmethod @_performance_tracking @@ -1373,9 +1359,6 @@ def from_arrays( (2, 'blue')], names=['number', 'color']) """ - # Imported here due to circular import - from cudf.core.algorithms import factorize - error_msg = "Input must be a list / sequence of array-likes." if not is_list_like(arrays): raise TypeError(error_msg) @@ -1438,7 +1421,7 @@ def _poplevels(self, level): # update self self.names = names - self._compute_levels_and_codes() + self._levels, self._codes = _compute_levels_and_codes(self._data) return popped @@ -1562,13 +1545,19 @@ def to_pandas( ) -> pd.MultiIndex: # cudf uses np.iinfo(size_type_dtype).min as missing code # pandas uses -1 as missing code - pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1) + pd_codes = ( + code.find_and_replace( + column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(-1, length=1), + ) + for code in self._codes + ) return pd.MultiIndex( levels=[ level.to_pandas(nullable=nullable, arrow_type=arrow_type) for level in self.levels ], - codes=[col.values_host for col in pd_codes._columns], + codes=[col.values_host for col in pd_codes], names=self.names, ) @@ -1743,13 +1732,9 @@ def _clean_nulls_from_index(self): @_performance_tracking def memory_usage(self, deep=False): - usage = sum(col.memory_usage for col in self._data.columns) - if self.levels: - for level in self.levels: - usage += level.memory_usage(deep=deep) - if self._codes_frame: - for col in self._codes_frame._data.columns: - usage += col.memory_usage + usage = sum(col.memory_usage for col in self._columns) + usage += sum(level.memory_usage(deep=deep) for level in self._levels) + usage += sum(code.memory_usage for code in self._codes) return usage @_performance_tracking @@ -1937,17 +1922,18 @@ def get_loc(self, key): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} - ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index(slice(len(key))) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range( + partial_index, + [column.as_column(k, length=1) for k in key], + is_sorted, + ) if lower_bound == upper_bound: raise KeyError(key) @@ -1972,7 +1958,7 @@ def get_loc(self, key): return true_inds # Not sorted and not unique. 
Return a boolean mask - mask = cp.full(self._data.nrows, False) + mask = cp.full(len(self), False) mask[true_inds] = True return mask @@ -2045,7 +2031,7 @@ def _union(self, other, sort=None): ignore_index=True, ) - midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) + midx = type(self)._from_data(result_df.iloc[:, : self.nlevels]._data) midx.names = self.names if self.names == other.names else None if sort in {None, True} and len(other): return midx.sort_values() @@ -2069,7 +2055,8 @@ def _intersection(self, other, sort=None): self_df.columns = col_names result_df = cudf.merge(self_df, other_df, how="inner") - midx = self.__class__.from_frame(result_df, names=res_name) + midx = type(self)._from_data(result_df._data) + midx.names = res_name if sort in {None, True} and len(other): return midx.sort_values() return midx @@ -2079,6 +2066,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: res = super()._copy_type_metadata(other) if isinstance(other, MultiIndex): res._names = other._names + self._levels, self._codes = _compute_levels_and_codes(res._data) return res @_performance_tracking diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index cdd4ec6f8e5..4e0c5bd86b9 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -13,9 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import pickle import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -23,7 +25,6 @@ import cudf import cudf._lib.labeling import cudf.core.index -from cudf._typing import DataFrameOrSeries from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -31,6 +32,9 @@ _Grouping, ) +if TYPE_CHECKING: + from cudf._typing import DataFrameOrSeries + class _Resampler(GroupBy): grouping: "_ResampleGrouping" diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..b538ae34b6f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. 
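The `sort` keyword documented above is accepted for pandas signature compatibility only; the guard added further down raises as soon as ``sort=False`` is passed. A short usage sketch, assuming the top-level ``cudf.unstack`` re-export:

    import cudf

    df = cudf.DataFrame(
        {"val": [1, 2, 3, 4]},
        index=cudf.MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["outer", "inner"]
        ),
    )
    cudf.unstack(df, level=1)              # default sort=True, behaves as before
    cudf.unstack(df, level=1, sort=False)  # raises NotImplementedError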
+ Returns ------- @@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29460d8c67e..f6331aa1f49 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand @@ -245,11 +245,7 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = ( - NaT - if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) - else NA - ) + value = NaT if dtype.kind in "mM" else NA return value, dtype diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4a60470fafa..d8dbaa897e7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,10 +22,8 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_integer, - is_integer_dtype, is_scalar, ) from cudf.core import indexing_utils @@ -214,17 +212,17 @@ def __setitem__(self, key, value): and self._frame.dtype.categories.dtype.kind == "f" ) ) - and isinstance(value, (np.float32, np.float64)) + and isinstance(value, np.floating) and np.isnan(value) ): raise MixedTypeError( f"Cannot assign {value=} to " f"non-float dtype={self._frame.dtype}" ) - elif ( - self._frame.dtype.kind == "b" - and not is_bool_dtype(value) - and value not in {None, cudf.NA} + elif self._frame.dtype.kind == "b" and not ( + value in {None, cudf.NA} + or isinstance(value, (np.bool_, bool)) + or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( f"Cannot assign {value=} to " @@ -357,12 +355,10 @@ def _loc_to_iloc(self, arg): ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) - and is_integer_dtype(index_dtype.categories.dtype) + and index_dtype.categories.dtype.kind in "iu" ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 @@ -961,7 +957,7 @@ def reindex(self, *args, **kwargs): dtype: int64 .. pandas-compat:: - **Series.reindex** + :meth:`pandas.Series.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -1244,7 +1240,7 @@ def map(self, arg, na_action=None) -> "Series": dtype: int64 .. pandas-compat:: - **Series.map** + :meth:`pandas.Series.map` Please note map currently only supports fixed-width numeric type functions. @@ -2064,6 +2060,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2077,6 +2074,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. 
+ key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -2095,7 +2100,7 @@ def sort_values( dtype: int64 .. pandas-compat:: - **Series.sort_values** + :meth:`pandas.Series.sort_values` * Support axis='index' only. * The inplace and kind argument is currently unsupported @@ -2108,6 +2113,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -2551,7 +2557,7 @@ def count(self): 5 .. pandas-compat:: - **Series.count** + :meth:`pandas.Series.count` Parameters currently not supported is `level`. """ @@ -2662,7 +2668,7 @@ def cov(self, other, min_periods=None): -0.015750000000000004 .. pandas-compat:: - **Series.cov** + :meth:`pandas.Series.cov` `min_periods` parameter is not yet supported. """ @@ -3221,7 +3227,7 @@ def describe( percentiles = np.array([0.25, 0.5, 0.75]) dtype = "str" - if is_bool_dtype(self.dtype): + if self.dtype.kind == "b": data = _describe_categorical(self, percentiles) elif isinstance(self._column, cudf.core.column.NumericalColumn): data = _describe_numeric(self, percentiles) @@ -3369,7 +3375,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -3380,7 +3385,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) @@ -3423,7 +3427,7 @@ def rename(self, index=None, copy=True): 'numeric_series' .. pandas-compat:: - **Series.rename** + :meth:`pandas.Series.rename` - Supports scalar values only for changing name attribute - The ``inplace`` and ``level`` is not supported @@ -3432,7 +3436,9 @@ def rename(self, index=None, copy=True): return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3530,7 +3536,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3555,6 +3566,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. 
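The `key` parameter threaded through both `DataFrame.sort_values` and `Series.sort_values` in this diff is likewise signature-only for now: the shared `IndexedFrame` implementation raises once a callable is supplied. For example:

    import cudf

    s = cudf.Series([3, 1, 2])
    s.sort_values()                  # works as before
    s.sort_values(key=lambda x: -x)  # raises NotImplementedError: key is not currently supported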
Returns ------- @@ -3599,11 +3613,15 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq) + change = diff / data.shift(periods=periods, freq=freq, **kwargs) return change @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") result_col = super().where(cond, other, inplace) return self._mimic_inplace( self._from_data_like_self( @@ -4703,7 +4721,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: dtype: object .. pandas-compat:: - **series.DatetimeProperties.strftime** + :meth:`pandas.DatetimeIndex.strftime` The following date format identifiers are not yet supported: ``%c``, ``%x``,``%X`` @@ -4731,9 +4749,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: f"for tracking purposes." ) return self._return_result_like_self( - self.series._column.as_string_column( - dtype="str", format=date_format - ) + self.series._column.strftime(format=date_format) ) @copy_docstring(DatetimeIndex.tz_localize) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..b93528f9693 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,9 +11,7 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, - is_integer_dtype, is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column @@ -92,12 +90,6 @@ def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) - def __bool__(self): - raise TypeError( - f"The truth value of a {type(self)} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." 
- ) - @property # type: ignore @_performance_tracking def _num_columns(self) -> int: @@ -359,9 +351,9 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = as_column(arg) if len(arg) == 0: arg = cudf.core.column.column_empty(0, dtype="int32") - if is_integer_dtype(arg.dtype): + if arg.dtype.kind in "iu": return self._column.take(arg) - if is_bool_dtype(arg.dtype): + if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): raise IndexError( f"Boolean mask has wrong length: {bn} not {n}" diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 397bfe1d472..c6e2b5d10e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ import warnings from typing import Literal, Sequence -import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset @@ -216,25 +215,25 @@ def to_datetime( + arg[unit_rev["day"]].astype("str").str.zfill(2) ) format = "%Y-%m-%d" - col = new_series._column.as_datetime_column( - "datetime64[s]", format=format - ) - for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: arg_col = arg._data[value] - if arg_col.dtype.kind in ("f"): - col = new_series._column.as_datetime_column( - "datetime64[ns]", format=format + if arg_col.dtype.kind == "f": + col = new_series._column.strptime( + cudf.dtype("datetime64[ns]"), format=format ) break - elif arg_col.dtype.kind in ("O"): + elif arg_col.dtype.kind == "O": if not cpp_is_integer(arg_col).all(): - col = new_series._column.as_datetime_column( - "datetime64[ns]", format=format + col = new_series._column.strptime( + cudf.dtype("datetime64[ns]"), format=format ) break + else: + col = new_series._column.strptime( + cudf.dtype("datetime64[s]"), format=format + ) times_column = None for u in ["h", "m", "s", "ms", "us", "ns"]: @@ -334,15 +333,15 @@ def _process_col( col = ( col.astype("int") .astype("str") - .as_datetime_column( - dtype="datetime64[us]" + .strptime( + dtype=cudf.dtype("datetime64[us]") if "%f" in format - else "datetime64[s]", + else cudf.dtype("datetime64[s]"), format=format, ) ) else: - col = col.as_datetime_column(dtype="datetime64[ns]") + col = col.astype(dtype="datetime64[ns]") elif col.dtype.kind in "iu": if unit in ("D", "h", "m"): @@ -353,11 +352,11 @@ def _process_col( col = col * factor if format is not None: - col = col.astype("str").as_datetime_column( - dtype=_unit_dtype_map[unit], format=format + col = col.astype("str").strptime( + dtype=cudf.dtype(_unit_dtype_map[unit]), format=format ) else: - col = col.as_datetime_column(dtype=_unit_dtype_map[unit]) + col = col.astype(dtype=cudf.dtype(_unit_dtype_map[unit])) elif col.dtype.kind == "O": if unit not in (None, "ns") or col.null_count == len(col): @@ -384,8 +383,8 @@ def _process_col( element=col.element_indexing(0), dayfirst=dayfirst, ) - col = col.as_datetime_column( - dtype=_unit_dtype_map[unit], + col = col.strptime( + dtype=cudf.dtype(_unit_dtype_map[unit]), format=format, ) elif col.dtype.kind != "M": @@ -894,7 +893,7 @@ def date_range( # integers and divide the number range evenly with `periods` elements. 
start = cudf.Scalar(start, dtype=dtype).value.astype("int64") end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = cp.linspace(start=start, stop=end, num=periods) + arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) @@ -991,8 +990,10 @@ def date_range( stop = end_estim.astype("int64") start = start.value.astype("int64") step = _offset_to_nanoseconds_lower_bound(offset) - arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") - res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + arr = range(int(start), int(stop), step) + res = cudf.core.column.as_column(arr, dtype="int64").astype( + "datetime64[ns]" + ) return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( tz diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index ef6b86a04a7..07158e4ee61 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,12 +8,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import strings as libstrings -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_datetime_dtype, - is_string_dtype, - is_timedelta_dtype, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import can_convert_to_column @@ -85,7 +80,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype: float64 .. pandas-compat:: - **cudf.to_numeric** + :func:`pandas.to_numeric` An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. @@ -114,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): col = as_column(arg) dtype = col.dtype - if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): + if dtype.kind in "mM": col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 21693e106bd..bb153d4b549 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -56,7 +56,7 @@ class ExponentialMovingWindow(_RollingBase): the equivalent pandas method. .. pandas-compat:: - **cudf.core.window.ExponentialMovingWindow** + :meth:`pandas.DataFrame.ewm` The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times`` are not yet supported. 
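In the `date_range` changes above, the `periods` branch now gets its evenly spaced timestamps from `np.linspace` over int64 nanoseconds, and the frequency branch builds the sequence from a plain `range` before converting to ``datetime64[ns]``, removing the CuPy calls. A NumPy-only sketch of the `periods` idea, with illustrative endpoints:

    import numpy as np

    start = np.datetime64("2024-01-01").astype("datetime64[ns]").astype("int64")
    end = np.datetime64("2024-01-02").astype("datetime64[ns]").astype("int64")
    stamps = np.linspace(start=start, stop=end, num=5).astype("int64").astype("datetime64[ns]")
    # five evenly spaced timestamps between 2024-01-01 and 2024-01-02, inclusive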
Behavior is defined only for data that begins diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e909d96309e..0f2820a01e9 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -50,7 +50,7 @@ def read_csv( comment=None, delim_whitespace=False, byte_range=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 7082a85237a..289292b5182 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,6 +10,7 @@ from cudf._lib import orc as liborc from cudf.api.types import is_list_like from cudf.utils import ioutils +from cudf.utils.utils import maybe_filter_deprecation def _make_empty_df(filepath_or_buffer, columns): @@ -280,7 +281,7 @@ def read_orc( num_rows=None, use_index=True, timestamp_type=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -320,6 +321,9 @@ def read_orc( ) filepaths_or_buffers = [] + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: if ioutils.is_directory( path_or_data=source, storage_options=storage_options @@ -360,17 +364,24 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + return DataFrame._from_data( + *liborc.read_orc( + filepaths_or_buffers, + columns, + stripes, + skiprows, + num_rows, + use_index, + timestamp_type, + ) ) - ) else: from pyarrow import orc diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 7733e770d99..0f0a240b5d0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds import cudf @@ -23,6 +24,7 @@ from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +from cudf.utils.utils import maybe_filter_deprecation BYTE_SIZES = { "kb": 1000, @@ -73,6 +75,7 @@ def _write_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, + write_arrow_schema=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -110,6 +113,7 @@ def _write_parquet( "column_encoding": column_encoding, "column_type_length": column_type_length, "output_as_binary": output_as_binary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -154,6 +158,7 @@ def write_to_dataset( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. 
For each combination of partition group and value, @@ -242,6 +247,9 @@ def write_to_dataset( output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. """ fs = ioutils._ensure_filesystem(fs, root_path, storage_options) @@ -285,6 +293,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) else: @@ -312,6 +321,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) return metadata @@ -342,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer): path_or_data=source, compression=None, fs=fs, - use_python_file_object=True, + use_python_file_object=None, open_file_options=None, storage_options=None, bytes_per_thread=None, @@ -524,7 +534,7 @@ def read_parquet( filters=None, row_groups=None, use_pandas_metadata=True, - use_python_file_object=True, + use_python_file_object=None, categorical_partitions=True, open_file_options=None, bytes_per_thread=None, @@ -607,6 +617,9 @@ def read_parquet( row_groups=row_groups, fs=fs, ) + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, @@ -654,19 +667,26 @@ def read_parquet( ) # Convert parquet data to a cudf.DataFrame - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - **kwargs, - ) + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + df = _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, + **kwargs, + ) # Apply filters row-wise (if any are defined), and return df = _apply_post_filters(df, filters) if projected_columns: @@ -908,7 +928,7 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - if cudf.get_option("mode.pandas_compatible"): + if cudf.get_option("io.parquet.low_memory"): return libparquet.ParquetReader( filepaths_or_buffers, columns=columns, @@ -968,6 +988,7 @@ def to_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, *args, **kwargs, ): @@ -1023,6 +1044,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) partition_info = ( @@ -1055,6 +1077,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + write_arrow_schema=store_schema, ) else: diff --git 
a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 1f539e7f266..94e73021cec 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -325,6 +325,32 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "io.parquet.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire parquet in one go. + If set to `True`, reads parquet file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + +_register_option( + "io.json.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire json in one go. + If set to `True`, reads json file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index a64bf7772fe..59a243dd7c4 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -260,6 +260,19 @@ def Index__new__(cls, *args, **kwargs): return self +def Index__setattr__(self, name, value): + if name.startswith("_"): + object.__setattr__(self, name, value) + return + if name == "name": + setattr(self._fsproxy_wrapped, "name", value) + if name == "names": + setattr(self._fsproxy_wrapped, "names", value) + return _FastSlowAttribute("__setattr__").__get__(self, type(self))( + name, value + ) + + Index = make_final_proxy_type( "Index", cudf.Index, @@ -277,11 +290,13 @@ def Index__new__(cls, *args, **kwargs): "__iter__": custom_iter, "__init__": _DELETE, "__new__": Index__new__, + "__setattr__": Index__setattr__, "_constructor": _FastSlowAttribute("_constructor"), "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), "_accessors": set(), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -292,7 +307,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": _FastSlowAttribute("name"), + }, ) SparseDtype = make_final_proxy_type( @@ -319,7 +338,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": _FastSlowAttribute("name"), + }, ) Categorical = make_final_proxy_type( @@ -348,8 +371,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -383,8 +408,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -439,8 +466,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, 
"_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -474,6 +503,7 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) + MultiIndex = make_final_proxy_type( "MultiIndex", cudf.MultiIndex, @@ -481,7 +511,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "names": _FastSlowAttribute("names"), + }, ) TimeGrouper = make_intermediate_proxy_type( @@ -667,8 +701,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -775,6 +811,18 @@ def Index__new__(cls, *args, **kwargs): pd.core.indexing._LocIndexer, ) +_AtIndexer = make_intermediate_proxy_type( + "_AtIndexer", + cudf.core.dataframe._DataFrameAtIndexer, + pd.core.indexing._AtIndexer, +) + +_iAtIndexer = make_intermediate_proxy_type( + "_iAtIndexer", + cudf.core.dataframe._DataFrameiAtIndexer, + pd.core.indexing._iAtIndexer, +) + FixedForwardWindowIndexer = make_final_proxy_type( "FixedForwardWindowIndexer", _Unusable, @@ -907,6 +955,12 @@ def Index__new__(cls, *args, **kwargs): _eval_func = _FunctionProxy(_Unusable(), pd.eval) +register_proxy_func(pd.read_pickle)( + _FunctionProxy(_Unusable(), pd.read_pickle) +) + +register_proxy_func(pd.to_pickle)(_FunctionProxy(_Unusable(), pd.to_pickle)) + def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None): frame = sys._getframe(level + 3) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index f8bfe340ae5..ed2c5ca06c9 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -4,17 +4,19 @@ import io import os +import numpy as np import pyarrow as pa import pytest from cudf._lib import pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType def metadata_from_arrow_type( pa_type: pa.Array, name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = plc.interop.ColumnMetadata(name) # None + metadata = plc.interop.ColumnMetadata(name) if pa.types.is_list(pa_type): child_meta = [plc.interop.ColumnMetadata("offsets")] for i in range(pa_type.num_fields): @@ -39,9 +41,25 @@ def metadata_from_arrow_type( def assert_column_eq( - lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column + lhs: pa.Array | plc.Column, + rhs: pa.Array | plc.Column, + check_field_nullability=True, ) -> None: - """Verify that a pylibcudf array and PyArrow array are equal.""" + """Verify that a pylibcudf array and PyArrow array are equal. + + Parameters + ---------- + lhs: Union[pa.Array, plc.Column] + The array with the expected values + rhs: Union[pa.Array, plc.Column] + The array to check + check_field_nullability: + For list/struct dtypes, whether to check if the nullable attributes + on child fields are equal. + + Useful for checking roundtripping of lossy formats like JSON that may not + preserve this information. + """ # Nested types require children metadata to be passed to the conversion function. 
if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance( rhs, plc.Column @@ -65,7 +83,47 @@ def assert_column_eq( if isinstance(rhs, pa.ChunkedArray): rhs = rhs.combine_chunks() - assert lhs.equals(rhs) + def _make_fields_nullable(typ): + new_fields = [] + for i in range(typ.num_fields): + child_field = typ.field(i) + if not child_field.nullable: + child_type = child_field.type + if isinstance(child_field.type, (pa.StructType, pa.ListType)): + child_type = _make_fields_nullable(child_type) + new_fields.append( + pa.field(child_field.name, child_type, nullable=True) + ) + else: + new_fields.append(child_field) + + if isinstance(typ, pa.StructType): + return pa.struct(new_fields) + elif isinstance(typ, pa.ListType): + return pa.list_(new_fields[0]) + return typ + + if not check_field_nullability: + rhs_type = _make_fields_nullable(rhs.type) + rhs = rhs.cast(rhs_type) + + lhs_type = _make_fields_nullable(lhs.type) + lhs = rhs.cast(lhs_type) + + if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): + lhs_nans = pa.compute.is_nan(lhs) + rhs_nans = pa.compute.is_nan(rhs) + assert lhs_nans.equals(rhs_nans) + + if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + # masks must be equal at this point + mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + lhs = lhs.filter(mask) + rhs = rhs.filter(mask) + + np.testing.assert_array_almost_equal(lhs, rhs) + else: + assert lhs.equals(rhs) def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: @@ -78,20 +136,32 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: def assert_table_and_meta_eq( - plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table + pa_table: pa.Table, + plc_table_w_meta: plc.io.types.TableWithMetadata, + check_field_nullability=True, + check_types_if_empty=True, + check_names=True, ) -> None: """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" plc_table = plc_table_w_meta.tbl plc_shape = (plc_table.num_rows(), plc_table.num_columns()) - assert plc_shape == pa_table.shape + assert ( + plc_shape == pa_table.shape + ), f"{plc_shape} is not equal to {pa_table.shape}" + + if not check_types_if_empty and plc_table.num_rows() == 0: + return for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): - assert_column_eq(plc_col, pa_col) + assert_column_eq(pa_col, plc_col, check_field_nullability) # Check column name equality - assert plc_table_w_meta.column_names == pa_table.column_names + if check_names: + assert ( + plc_table_w_meta.column_names() == pa_table.column_names + ), f"{plc_table_w_meta.column_names()} != {pa_table.column_names}" def cudf_raises(expected_exception: BaseException, *args, **kwargs): @@ -102,49 +172,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
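With the reworked helpers above, the expected pyarrow value now comes first and the pylibcudf result second, NaNs are compared positionally, and nested-field nullability can be relaxed. A small illustrative call, assuming the `utils` module shown here is importable from a test:

    import pyarrow as pa
    from utils import assert_column_eq

    import cudf._lib.pylibcudf as plc

    expect = pa.array([1.0, float("nan"), 2.0], type=pa.float64())
    got = plc.interop.from_arrow(expect)

    # Expected value first, result second; NaN positions must match and the
    # remaining float values are compared approximately.
    assert_column_eq(expect, got)
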
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) - - -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 - - def is_string(plc_dtype: plc.DataType): return plc_dtype.id() == plc.TypeId.STRING -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) - - def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): @@ -165,6 +196,48 @@ def is_nested_list(typ): return nesting_level(typ)[0] > 1 +def _convert_numeric_types_to_floating(pa_table): + """ + Useful little helper for testing the + dtypes option in I/O readers. + + Returns a tuple containing the pylibcudf dtypes + and the new pyarrow schema + """ + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + return dtypes, new_fields + + +def write_source_str(source, input_str): + """ + Write a string to the source + (useful for testing CSV/JSON I/O) + """ + if not isinstance(source, io.IOBase): + with open(source, "w") as source_f: + source_f.write(input_str) + else: + if isinstance(source, io.BytesIO): + input_str = input_str.encode("utf-8") + source.write(input_str) + source.seek(0) + + def sink_to_str(sink): """ Takes a sink (e.g. StringIO/BytesIO, filepath, etc.) @@ -183,6 +256,31 @@ def sink_to_str(sink): return str_result +def make_source(path_or_buf, pa_table, format, **kwargs): + """ + Write a pyarrow Table to a specific format using pandas + by dispatching to the appropriate to_* call. + The caller is responsible for making sure that no arguments + unsupported by pandas are passed in. + """ + df = pa_table.to_pandas() + mode = "w" + if "compression" in kwargs: + kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[ + kwargs["compression"] + ] + if kwargs["compression"] is not None and format != "json": + # pandas json method only supports mode="w"/"a" + mode = "wb" + if format == "json": + df.to_json(path_or_buf, mode=mode, **kwargs) + elif format == "csv": + df.to_csv(path_or_buf, mode=mode, **kwargs) + if isinstance(path_or_buf, io.IOBase): + path_or_buf.seek(0) + return path_or_buf + + NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()] STRING_PA_TYPES = [pa.string()] BOOL_PA_TYPES = [pa.bool_()] @@ -221,4 +319,26 @@ def sink_to_str(sink): + DEFAULT_PA_STRUCT_TESTING_TYPES ) +# Map pylibcudf compression types to pandas ones +# Not all compression types map cleanly, read the comments to learn more! +# If a compression type is unsupported, it maps to False. 
+ +COMPRESSION_TYPE_TO_PANDAS = { + CompressionType.NONE: None, + # Users of this dict will have to special case + # AUTO + CompressionType.AUTO: None, + CompressionType.GZIP: "gzip", + CompressionType.BZIP2: "bz2", + CompressionType.ZIP: "zip", + CompressionType.XZ: "xz", + CompressionType.ZSTD: "zstd", + # Unsupported + CompressionType.ZLIB: False, + CompressionType.LZ4: False, + CompressionType.LZO: False, + # These only work for parquet + CompressionType.SNAPPY: "snappy", + CompressionType.BROTLI: "brotli", +} ALL_PA_TYPES = DEFAULT_PA_TYPES diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index e4760ea7ac8..4a7194a6d8d 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -11,6 +11,7 @@ import pytest import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) @@ -37,6 +38,37 @@ def numeric_pa_type(request): return request.param +def _get_vals_of_type(pa_type, length, seed): + """ + Returns an list-like of random values of that type + """ + rng = np.random.default_rng(seed=seed) + if pa_type == pa.int64(): + half = length // 2 + negs = rng.integers(-length, 0, half, dtype=np.int64) + pos = rng.integers(0, length, length - half, dtype=np.int64) + return np.concatenate([negs, pos]) + elif pa_type == pa.uint64(): + return rng.integers(0, length, length, dtype=np.uint64) + elif pa_type == pa.float64(): + # Round to 6 decimal places or else we have problems comparing our + # output to pandas due to floating point/rounding differences + return rng.uniform(-length, length, length).round(6) + elif pa_type == pa.bool_(): + return rng.integers(0, 2, length, dtype=bool) + elif pa_type == pa.string(): + # Generate random ASCII strings + strs = [] + for _ in range(length): + chrs = rng.integers(33, 128, length) + strs.append("".join(chr(x) for x in chrs)) + return strs + else: + raise NotImplementedError( + f"random data generation not implemented for {pa_type}" + ) + + # TODO: Consider adding another fixture/adapting this # fixture to consider nullability @pytest.fixture(scope="session", params=[0, 100]) @@ -57,10 +89,9 @@ def table_data(request): # plc.io.TableWithMetadata colnames = [] - np.random.seed(42) + seed = 42 for typ in ALL_PA_TYPES: - rand_vals = np.random.randint(0, nrows, nrows) child_colnames = [] def _generate_nested_data(typ): @@ -88,13 +119,17 @@ def _generate_nested_data(typ): child_colnames.append(("", grandchild_colnames)) else: # typ is scalar type - pa_array = pa.array(rand_vals).cast(typ) + pa_array = pa.array( + _get_vals_of_type(typ, nrows, seed=seed), type=typ + ) return pa_array, child_colnames if isinstance(typ, (pa.ListType, pa.StructType)): rand_arr, child_colnames = _generate_nested_data(typ) else: - rand_arr = pa.array(rand_vals).cast(typ) + rand_arr = pa.array( + _get_vals_of_type(typ, nrows, seed=seed), type=typ + ) table_dict[f"col_{typ}"] = rand_arr colnames.append((f"col_{typ}", child_colnames)) @@ -106,6 +141,20 @@ def _generate_nested_data(typ): ), pa_table +@pytest.fixture(params=[(0, 0), ("half", 0), (-1, "half")]) +def nrows_skiprows(table_data, request): + """ + Parametrized nrows fixture that accompanies table_data + """ + _, pa_table = table_data + nrows, skiprows = request.param + if nrows == "half": + nrows = len(pa_table) // 2 + if skiprows == "half": + skiprows = (len(pa_table) - nrows) // 2 + return nrows, skiprows + + @pytest.fixture( 
params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO], ) @@ -121,6 +170,38 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +unsupported_types = { + # Not supported by pandas + # TODO: find a way to test these + CompressionType.SNAPPY, + CompressionType.BROTLI, + CompressionType.LZ4, + CompressionType.LZO, + CompressionType.ZLIB, +} + +unsupported_text_compression_types = unsupported_types.union( + { + # compressions not supported by libcudf + # for csv/json + CompressionType.XZ, + CompressionType.ZSTD, + } +) + + +@pytest.fixture( + params=set(CompressionType).difference(unsupported_text_compression_types) +) +def text_compression_type(request): + return request.param + + +@pytest.fixture(params=[opt for opt in plc.io.types.CompressionType]) +def compression_type(request): + return request.param + + @pytest.fixture( scope="session", params=[opt for opt in plc.types.Interpolation] ) @@ -136,6 +217,15 @@ def sorted_opt(request): return request.param -@pytest.fixture(scope="session", params=[False, True]) +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nulls", "with_nulls"] +) def has_nulls(request): return request.param + + +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nans", "with_nans"] +) +def has_nans(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py index d6cd86768cd..061d6792ce3 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py @@ -120,4 +120,4 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): if columns != []: expected = expected.select(columns) - assert_table_and_meta_eq(res, expected) + assert_table_and_meta_eq(expected, res) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py new file mode 100644 index 00000000000..95326a8b681 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py @@ -0,0 +1,280 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import io +import os +from io import StringIO + +import pandas as pd +import pyarrow as pa +import pytest +from utils import ( + _convert_numeric_types_to_floating, + assert_table_and_meta_eq, + make_source, + write_source_str, +) + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + +# Shared kwargs to pass to make_source +_COMMON_CSV_SOURCE_KWARGS = { + "format": "csv", + "index": False, +} + + +@pytest.fixture(scope="module") +def csv_table_data(table_data): + """ + Like the table_data but with nested types dropped + since the CSV reader can't handle that + uint64 is also dropped since it can get confused with int64 + """ + _, pa_table = table_data + pa_table = pa_table.drop_columns( + [ + "col_uint64", + "col_list", + "col_list>", + "col_struct", + "col_struct not null>", + ] + ) + return plc.interop.from_arrow(pa_table), pa_table + + +@pytest.mark.parametrize("delimiter", [",", ";"]) +def test_read_csv_basic( + csv_table_data, + source_or_sink, + text_compression_type, + nrows_skiprows, + delimiter, +): + _, pa_table = csv_table_data + compression_type = text_compression_type + nrows, skiprows = nrows_skiprows + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + source = make_source( + source_or_sink, + pa_table, + compression=compression_type, + sep=delimiter, + **_COMMON_CSV_SOURCE_KWARGS, + ) + + # Rename the table (by reversing the names) to test names argument + pa_table = pa_table.rename_columns(pa_table.column_names[::-1]) + column_names = pa_table.column_names + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + res = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), + delimiter=delimiter, + compression=compression_type, + col_names=column_names, + nrows=nrows, + skiprows=skiprows, + ) + + assert_table_and_meta_eq( + pa_table, + res, + check_types_if_empty=False, + check_names=False if skiprows > 0 and column_names is None else True, + ) + + +# Note: make sure chunk size is big enough so that dtype inference +# infers correctly +@pytest.mark.parametrize("chunk_size", [1000, 5999]) +def test_read_csv_byte_range(table_data, chunk_size, tmp_path): + _, pa_table = table_data + if len(pa_table) == 0: + # pandas writes nothing when we have empty table + # and header=None + pytest.skip("Don't test empty table case") + source = f"{tmp_path}/a.csv" + source = make_source( + source, pa_table, header=False, **_COMMON_CSV_SOURCE_KWARGS + ) + file_size = os.stat(source).st_size + tbls_w_meta = [] + for segment in range((file_size + chunk_size - 1) // chunk_size): + tbls_w_meta.append( + plc.io.csv.read_csv( + plc.io.SourceInfo([source]), + byte_range_offset=segment * chunk_size, + byte_range_size=chunk_size, + header=-1, + col_names=pa_table.column_names, + ) + ) + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_csv(source, names=pa_table.column_names, header=None) + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("usecols", [None, ["col_int64", "col_bool"], [0, 1]]) +def test_read_csv_dtypes(csv_table_data, 
source_or_sink, usecols): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = csv_table_data + + source = make_source( + source_or_sink, + pa_table, + **_COMMON_CSV_SOURCE_KWARGS, + ) + # Adjust table for usecols + if usecols is not None: + pa_table = pa_table.select(usecols) + + dtypes, new_fields = _convert_numeric_types_to_floating(pa_table) + # Extract the dtype out of the (name, type, child_types) tuple + # (read_csv doesn't support this format since it doesn't support nested columns) + dtypes = {name: dtype for name, dtype, _ in dtypes} + + new_schema = pa.schema(new_fields) + + res = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols + ) + new_table = pa_table.cast(new_schema) + + assert_table_and_meta_eq(new_table, res) + + +@pytest.mark.parametrize("skip_blanks", [True, False]) +@pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) +@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +def test_read_csv_parse_options( + source_or_sink, decimal, quotechar, skip_blanks, lineterminator +): + lines = [ + "# first comment line", + "# third comment line", + "1,2,3,4_4,'z'", + '4,5,6,5_5,""', + "7,8,9,9_87,'123'", + "# last comment line", + "1,1,1,10_11,abc", + ] + buffer = lineterminator.join(lines) + + write_source_str(source_or_sink, buffer) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source_or_sink]), + comment="#", + decimal=decimal, + skip_blank_lines=skip_blanks, + quotechar=quotechar, + ) + df = pd.read_csv( + StringIO(buffer), + comment="#", + decimal=decimal, + skip_blank_lines=skip_blanks, + quotechar=quotechar, + ) + assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) + + +@pytest.mark.parametrize("na_filter", [True, False]) +@pytest.mark.parametrize("na_values", [["n/a"], ["NV_NAN"]]) +@pytest.mark.parametrize("keep_default_na", [True, False]) +def test_read_csv_na_values( + source_or_sink, na_filter, na_values, keep_default_na +): + lines = ["a,b,c", "n/a,NaN,NV_NAN", "1.0,2.0,3.0"] + buffer = "\n".join(lines) + + write_source_str(source_or_sink, buffer) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source_or_sink]), + na_filter=na_filter, + na_values=na_values if na_filter else None, + keep_default_na=keep_default_na, + ) + df = pd.read_csv( + StringIO(buffer), + na_filter=na_filter, + na_values=na_values if na_filter else None, + keep_default_na=keep_default_na, + ) + assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) + + +@pytest.mark.parametrize("header", [0, 10, -1]) +def test_read_csv_header(csv_table_data, source_or_sink, header): + _, pa_table = csv_table_data + + source = make_source( + source_or_sink, + pa_table, + **_COMMON_CSV_SOURCE_KWARGS, + ) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), header=header + ) + if header > 0: + if header < len(pa_table): + names_row = pa_table.take([header - 1]).to_pylist()[0].values() + pa_table = pa_table.slice(header) + col_names = [str(name) for name in names_row] + pa_table = pa_table.rename_columns(col_names) + else: + pa_table = pa.table([]) + elif header < 0: + # neg header means use user-provided names (in this case nothing) + # (the original column names are now data) + tbl_dict = pa_table.to_pydict() + new_tbl_dict = {} + for i, (name, vals) in enumerate(tbl_dict.items()): + str_vals = [str(val) for val in vals] + new_tbl_dict[str(i)] = [name] + str_vals + pa_table = pa.table(new_tbl_dict) + + assert_table_and_meta_eq( + 
pa_table, + plc_table_w_meta, + check_types_if_empty=False, + ) + + +# TODO: test these +# str prefix = "", +# bool mangle_dupe_cols = True, +# size_type skipfooter = 0, +# str thousands = None, +# bool delim_whitespace = False, +# bool skipinitialspace = False, +# quote_style quoting = quote_style.MINIMAL, +# bool doublequote = True, +# bool detect_whitespace_around_quotes = False, +# list parse_dates = None, +# list true_values = None, +# list false_values = None, +# bool dayfirst = False, diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py index d6b8bfa6976..4239f2438bb 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -1,11 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import io +import pandas as pd import pyarrow as pa import pytest -from utils import sink_to_str +from utils import ( + assert_table_and_meta_eq, + make_source, + sink_to_str, + write_source_str, +) import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + +# Shared kwargs to pass to make_source +_COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} @pytest.mark.parametrize("rows_per_chunk", [8, 100]) @@ -114,3 +124,217 @@ def test_write_json_bool_opts(true_value, false_value): pd_result = pd_result.replace("false", false_value) assert str_result == pd_result + + +@pytest.mark.parametrize("lines", [True, False]) +def test_read_json_basic( + table_data, source_or_sink, lines, text_compression_type +): + compression_type = text_compression_type + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + _, pa_table = table_data + + source = make_source( + source_or_sink, + pa_table, + lines=lines, + compression=compression_type, + **_COMMON_JSON_SOURCE_KWARGS, + ) + + if isinstance(source, io.IOBase): + source.seek(0) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), + compression=compression_type, + lines=lines, + ) + + # Adjustments to correct for the fact orient=records is lossy + # and doesn't + # 1) preserve colnames when zero rows in table + # 2) preserve struct nullability + # 3) differentiate int64/uint64 + if len(pa_table) == 0: + pa_table = pa.table([]) + + new_fields = [] + for i in range(len(pa_table.schema)): + curr_field = pa_table.schema.field(i) + if curr_field.type == pa.uint64(): + try: + curr_field = curr_field.with_type(pa.int64()) + except OverflowError: + # There will be no confusion, values are too large + # for int64 anyways + pass + new_fields.append(curr_field) + + pa_table = pa_table.cast(pa.schema(new_fields)) + + # Convert non-nullable struct fields to nullable fields + # since nullable=False cannot roundtrip through orient='records' + # JSON format + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +def test_read_json_dtypes(table_data, source_or_sink): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = table_data + source = make_source( + source_or_sink, + pa_table, + lines=True, + **_COMMON_JSON_SOURCE_KWARGS, + ) + + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + def get_child_types(typ): + typ_child_types = [] + for i in range(typ.num_fields): + curr_field = typ.field(i) + typ_child_types.append( + ( + curr_field.name, + curr_field.type, + 
get_child_types(curr_field.type), + ) + ) + return typ_child_types + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + + new_schema = pa.schema(new_fields) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), dtypes=dtypes, lines=True + ) + new_table = pa_table.cast(new_schema) + + # orient=records is lossy + # and doesn't preserve column names when there's zero rows in the table + if len(new_table) == 0: + new_table = pa.table([]) + + assert_table_and_meta_eq(new_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize("chunk_size", [10, 15, 20]) +def test_read_json_lines_byte_range(source_or_sink, chunk_size): + source = source_or_sink + if isinstance(source_or_sink, io.StringIO): + pytest.skip("byte_range doesn't work on StringIO") + + json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" + write_source_str(source, json_str) + + tbls_w_meta = [] + for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): + tbls_w_meta.append( + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + byte_range_offset=chunk_start, + byte_range_size=chunk_start + chunk_size, + ) + ) + + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_json(source, orient="records", lines=True) + + # TODO: can do this operation using pylibcudf + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("keep_quotes", [True, False]) +def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): + source = source_or_sink + + json_bytes = '["a", "b", "c"]\n' + write_source_str(source, json_bytes) + + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes + ) + + template = "{0}" + if keep_quotes: + template = '"{0}"' + + exp = pa.Table.from_arrays( + [ + [template.format("a")], + [template.format("b")], + [template.format("c")], + ], + names=["0", "1", "2"], + ) + + assert_table_and_meta_eq(exp, tbl_w_meta) + + +@pytest.mark.parametrize( + "recovery_mode", [opt for opt in plc.io.types.JSONRecoveryMode] +) +def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): + source = source_or_sink + + json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + write_source_str(source, json_str) + + if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: + with pytest.raises(RuntimeError): + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + else: + # Recover case (bad values replaced with nulls) + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + exp = pa.Table.from_arrays( + [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"] + ) + assert_table_and_meta_eq(exp, tbl_w_meta) + + +# TODO: Add tests for these! 
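The JSON tests above follow the same pattern; a minimal lines-mode read, assuming an in-memory source:

    import io

    import cudf._lib.pylibcudf as plc

    src = io.BytesIO(b'{"a": 1, "b": 10}\n{"a": 2, "b": 11}\n')

    tbl_w_meta = plc.io.json.read_json(plc.io.SourceInfo([src]), lines=True)
    print(tbl_w_meta.column_names())  # expected: ['a', 'b']
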
+# Tests were not added in the initial PR porting the JSON reader to pylibcudf +# to save time (and since there are no existing tests for these in Python cuDF) +# mixed_types_as_string = mixed_types_as_string, +# prune_columns = prune_columns, diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py index 287dd8f21c8..438c482b77a 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py @@ -2,11 +2,9 @@ import io -import pyarrow as pa import pytest import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) @@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink): """ Skip invalid sinks for SinkInfo """ - if io_class is plc.io.SinkInfo and isinstance( - sink, (bytes, NativeFileDatasource) - ): - pytest.skip(f"{sink} is not a valid input for SinkInfo") + if io_class is plc.io.SinkInfo and isinstance(sink, bytes): + pytest.skip("bytes is not a valid input for SinkInfo") @pytest.mark.parametrize( @@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink): "a.txt", b"hello world", io.BytesIO(b"hello world"), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) def test_source_info_ctor(io_class, source, tmp_path): @@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path): @pytest.mark.parametrize( "sources", [ + ["a.txt"], + [b"hello world"], + [io.BytesIO(b"hello world")], ["a.txt", "a.txt"], [b"hello world", b"hello there"], [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - ], ], ) def test_source_info_ctor_multiple(io_class, sources, tmp_path): @@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path): io.BytesIO(b"hello there"), b"hello world", ], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - "awef.txt", - b"hello world", - ], ], ) def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py new file mode 100644 index 00000000000..a83caf39ead --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py @@ -0,0 +1,786 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
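The new test_binaryops.py below exercises libcudf binary operations through pylibcudf. A compact sketch of the call pattern, checking supportability before executing, with illustrative data:

    import pyarrow as pa

    import cudf._lib.pylibcudf as plc

    lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int64()))
    rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int64()))
    out_type = plc.DataType(plc.TypeId.INT64)
    op = plc.binaryop.BinaryOperator.ADD

    # Unsupported type/operator combinations raise TypeError, so check first.
    if plc.binaryop.is_supported_operation(out_type, lhs.type(), rhs.type(), op):
        result = plc.binaryop.binary_operation(lhs, rhs, op, out_type)
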
+ +import math + +import numpy as np +import pyarrow as pa +import pytest +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def idfn(param): + ltype, rtype, outtype, plc_op, _ = param + params = (plc_op.name, ltype, rtype, outtype) + return "-".join(map(str, params)) + + +@pytest.fixture(params=[True, False], ids=["nulls", "no_nulls"]) +def nulls(request): + return request.param + + +def make_col(dtype, nulls): + if dtype == "int64": + data = [1, 2, 3, 4, 5] + pa_type = pa.int64() + elif dtype == "uint64": + data = [1, 2, 3, 4, 5] + pa_type = pa.uint64() + elif dtype == "float64": + data = [1.0, 2.0, 3.0, 4.0, 5.0] + pa_type = pa.float64() + elif dtype == "bool": + data = [True, False, True, False, True] + pa_type = pa.bool_() + elif dtype == "timestamp64[ns]": + data = [ + np.datetime64("2022-01-01"), + np.datetime64("2022-01-02"), + np.datetime64("2022-01-03"), + np.datetime64("2022-01-04"), + np.datetime64("2022-01-05"), + ] + pa_type = pa.timestamp("ns") + elif dtype == "timedelta64[ns]": + data = [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + pa_type = pa.duration("ns") + else: + raise ValueError("Unsupported dtype") + + if nulls: + data[3] = None + + return pa.array(data, type=pa_type) + + +@pytest.fixture +def pa_data(request, nulls): + ltype, rtype, outtype = request.param + values = make_col(ltype, nulls), make_col(rtype, nulls), outtype + return values + + +@pytest.fixture +def plc_data(pa_data): + lhs, rhs, outtype = pa_data + return ( + plc.interop.from_arrow(lhs), + plc.interop.from_arrow(rhs), + plc.interop.from_arrow(pa.from_numpy_dtype(np.dtype(outtype))), + ) + + +@pytest.fixture +def tests(request, nulls): + ltype, rtype, py_outtype, plc_op, py_op = request.param + pa_lhs, pa_rhs = make_col(ltype, nulls), make_col(rtype, nulls) + plc_lhs, plc_rhs = ( + plc.interop.from_arrow(pa_lhs), + plc.interop.from_arrow(pa_rhs), + ) + plc_dtype = plc.interop.from_arrow( + pa.from_numpy_dtype(np.dtype(py_outtype)) + ) + return ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_dtype, + py_op, + plc_op, + ) + + +def custom_pyop(func): + def wrapper(x, y): + x = x.to_pylist() + y = y.to_pylist() + + def inner(x, y): + if x is None or y is None: + return None + return func(x, y) + + return pa.array([inner(x, y) for x, y in zip(x, y)]) + + return wrapper + + +@custom_pyop +def py_floordiv(x, y): + return x // y + + +@custom_pyop +def py_pmod(x, y): + return (x % y + y) % y + + +@custom_pyop +def py_mod(x, y): + return x % y + + +@custom_pyop +def py_atan2(x, y): + return math.atan2(x, y) + + +@custom_pyop +def py_shift_right_unsigned(x, y): + unsigned_x = np.uint32(x) + result = unsigned_x >> y + return result + + +@pytest.mark.parametrize( + "tests", + [ + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + 
), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.MOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PMOD, py_pmod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PYMOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + 
"int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "float64", + 
"float64", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.GENERIC_BINARY, + None, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INVALID_BINARY, + None, + ), + ], + indirect=True, + ids=idfn, +) +def test_binaryops(tests): + ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_outtype, + py_op, + plc_op, + ) = tests + + def get_result(): + return plc.binaryop.binary_operation( + plc_lhs, + plc_rhs, + plc_op, + plc_outtype, + ) + + if not plc.binaryop.is_supported_operation( + plc_outtype, plc_lhs.type(), plc_rhs.type(), plc_op + ): + with pytest.raises(TypeError): + get_result() + else: + expect = py_op(pa_lhs, pa_rhs).cast(py_outtype) + got = get_result() + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0a6df198d46..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -9,9 +9,6 @@ assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, is_nested_list, is_nested_struct, is_string, @@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = 
[plc.interop.from_arrow(pa.scalar(1))] @@ -458,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, @@ -484,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := 
plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/cudf/cudf/pylibcudf_tests/test_expressions.py new file mode 100644 index 00000000000..f661512caad --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_expressions.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + +# We can't really evaluate these expressions, so just make sure +# construction works properly + + +def test_literal_construction_invalid(): + with pytest.raises(ValueError): + plc.expressions.Literal( + plc.interop.from_arrow(pa.scalar(None, type=pa.list_(pa.int64()))) + ) + + +@pytest.mark.parametrize( + "tableref", + [ + plc.expressions.TableReference.LEFT, + plc.expressions.TableReference.RIGHT, + ], +) +def test_columnref_construction(tableref): + plc.expressions.ColumnReference(1.0, tableref) + + +def test_columnnameref_construction(): + plc.expressions.ColumnNameReference("abc") + + +@pytest.mark.parametrize( + "kwargs", + [ + # Unary op + { + "op": plc.expressions.ASTOperator.IDENTITY, + "left": plc.expressions.ColumnReference(1), + }, + # Binop + { + "op": plc.expressions.ASTOperator.ADD, + "left": plc.expressions.ColumnReference(1), + "right": plc.expressions.ColumnReference(2), + }, + ], +) +def test_astoperation_construction(kwargs): + plc.expressions.Operation(**kwargs) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index c781126e388..7cfed884f90 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -134,3 +134,60 @@ def test_index_of_list_column(test_data, column): expect = pa.array(column[1], type=pa.int32()) assert_column_eq(expect, res) + + +def test_reverse(test_data): + list_column = test_data[0][0] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.reverse(plc_column) + + expect = pa.array([lst[::-1] for lst in list_column]) + + assert_column_eq(expect, res) + + +def test_segmented_gather(test_data): + list_column1 = test_data[0][0] + list_column2 = test_data[0][1] + + plc_column1 = plc.interop.from_arrow(pa.array(list_column1)) + plc_column2 = plc.interop.from_arrow(pa.array(list_column2)) + + res = plc.lists.segmented_gather(plc_column2, plc_column1) + + expect = 
pa.array([[8, 9], [14], [0], [0, 0]]) + + assert_column_eq(expect, res) + + +def test_extract_list_element_scalar(test_data): + arr = pa.array(test_data[0][0]) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.extract_list_element(plc_column, 0) + expect = pa.compute.list_element(test_data[0][0], 0) + + assert_column_eq(expect, res) + + +def test_extract_list_element_column(test_data): + arr = pa.array(test_data[0][0]) + plc_column = plc.interop.from_arrow(arr) + indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1])) + + res = plc.lists.extract_list_element(plc_column, indices) + expect = pa.array([0, None, None, 7]) + + assert_column_eq(expect, res) + + +def test_count_elements(test_data): + arr = pa.array(test_data[0][1]) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.count_elements(plc_column) + + expect = pa.array([1, 1, 0, 3], type=pa.int32()) + + assert_column_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert 
plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/cudf/cudf/pylibcudf_tests/test_transform.py new file mode 100644 index 00000000000..312939888dd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_transform.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import math + +import pyarrow as pa +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def test_nans_to_nulls(has_nans): + if has_nans: + values = [1, float("nan"), float("nan"), None, 3, None] + else: + values = [1, 4, 5, None, 3, None] + + replaced = [ + None if (v is None or (v is not None and math.isnan(v))) else v + for v in values + ] + + h_input = pa.array(values, type=pa.float32()) + input = plc.interop.from_arrow(h_input) + assert input.null_count() == h_input.null_count + expect = pa.array(replaced, type=pa.float32()) + + mask, null_count = plc.transform.nans_to_nulls(input) + + assert null_count == expect.null_count + got = input.with_mask(mask, null_count) + + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_unary.py b/python/cudf/cudf/pylibcudf_tests/test_unary.py new file mode 100644 index 00000000000..b5e4f0cb0e8 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_unary.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
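The test_transform.py test above shows the nans_to_nulls round trip: NaN positions are folded into the null mask returned by the API, which is then attached to the column. An illustrative sketch of that flow:

    import pyarrow as pa

    import cudf._lib.pylibcudf as plc

    col = plc.interop.from_arrow(
        pa.array([1.0, float("nan"), None], type=pa.float32())
    )

    # Build a null mask that also covers NaN positions, then attach it.
    mask, null_count = plc.transform.nans_to_nulls(col)
    col_with_nulls = col.with_mask(mask, null_count)  # null_count == 2 here
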
+ +from cudf._lib import pylibcudf as plc + + +def test_is_supported_cast(): + assert plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT64) + ) + assert plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.DURATION_MILLISECONDS), + plc.DataType(plc.TypeId.UINT64), + ) + assert not plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.TIMESTAMP_DAYS) + ) + assert not plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.STRING) + ) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index e56c8d867cb..c2072d90e98 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -158,12 +158,12 @@ def assert_column_equal( return True if check_datetimelike_compat: - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": right = right.astype(left.dtype) - elif np.issubdtype(right.dtype, np.datetime64): + elif right.dtype.kind == "M": left = left.astype(right.dtype) - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": if not left.equals(right): raise AssertionError( f"[datetimelike_compat=True] {left.values} " @@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if np.issubdtype(left.dtype, np.floating) and np.issubdtype( - right.dtype, np.floating - ): + if left.dtype.kind == "f" and right.dtype.kind == "f": assert np.allclose(left, right, equal_nan=True) else: assert np.array_equal(left, right) diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py index e1dd359e1ba..1d680d7860d 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/test_conversion.py @@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype): assert_eq(expect, got) +def test_convert_integer_false_convert_floating_true(): + data = [1.000000000000000000000000001, 1] + expected = pd.Series(data).convert_dtypes( + convert_integer=False, convert_floating=True + ) + result = ( + cudf.Series(data) + .convert_dtypes(convert_integer=False, convert_floating=True) + .to_pandas(nullable=True) + ) + assert_eq(result, expected) + + # Now write the same test, but construct a DataFrame # as input instead of parametrizing: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 7d8c3b53115..503b1a975b4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -539,7 +539,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = func(gs) + try: + gs_result = func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + func(random_series) + + return # class typing if obj_class == "Index": @@ -589,7 +596,14 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = gpu_func(gs) + try: + gs_result = gpu_func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + cpu_func(random_series) + + return # class typing if obj_class == "Index": @@ -770,7 +784,8 @@ def test_operator_func_series_and_scalar( fill_value=fill_value, ) pdf_series_result = getattr(pdf_series, func)( - scalar, 
fill_value=fill_value + np.array(scalar)[()] if use_cudf_scalar else scalar, + fill_value=fill_value, ) assert_eq(pdf_series_result, gdf_series_result) @@ -1679,12 +1694,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(result.dtype) - or cudf.api.types.is_timedelta64_dtype(result.dtype) - else cudf.NA - ) + assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA) # make sure dtype is the same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 09617306606..6a21cb1b9d7 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf): # Arrow FileSystem interface expect = cudf.read_csv(path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath")) - with fs.open_input_file(path) as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(path) as fil: + got = cudf.read_csv(fil) assert_eq(expect, got) @@ -1191,7 +1192,7 @@ def test_csv_reader_byte_range_type_corner_case(tmpdir): ).to_csv(fname, chunksize=100000) byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): + with pytest.raises(OverflowError, match="Offset is past end of file"): cudf.read_csv(fname, byte_range=byte_range, header=None) @@ -1617,7 +1618,7 @@ def test_csv_reader_partial_dtype(dtype): StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] ) - assert names_df == header_df + assert_eq(names_df, header_df) assert all(names_df.dtypes == ["int16", "int64"]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f40106a30f4..e2ce5c03b70 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): else (pdf[column].notna().count() == 0) ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) + or pdf[column].dtype.kind == "b" for column in pdf ): with pytest.raises(TypeError): @@ -5457,9 +5457,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - if not numeric_only and not all( - cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes - ): + if not numeric_only and not all(dt.kind == "M" for dt in gdf.dtypes): with pytest.raises(TypeError): got = getattr(gdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only @@ -10835,7 +10833,7 @@ def test_dataframe_contains(name, contains, other_names): expectation = contains is cudf.NA and name is cudf.NA assert (contains in pdf) == expectation assert (contains in gdf) == expectation - elif pd.api.types.is_float_dtype(gdf.columns.dtype): + elif gdf.columns.dtype.kind == "f": # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. 
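Several hunks in this file (and in the test, scalar, and dtype changes below) replace `np.issubdtype`/`cudf.api.types.is_*_dtype` checks with comparisons against `dtype.kind`. For orientation only (standard NumPy behavior, not part of the diff), the single-character kind codes relied on are:

```python
import numpy as np

assert np.dtype("datetime64[ns]").kind == "M"   # datetime64
assert np.dtype("timedelta64[ms]").kind == "m"  # timedelta64
assert np.dtype("float64").kind == "f"          # floating point
assert np.dtype("bool").kind == "b"             # boolean
assert np.dtype("int64").kind == "i"            # signed integer
assert np.dtype("uint8").kind == "u"            # unsigned integer
```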
expectation = contains is np.nan and (name is None or name is np.nan) @@ -11102,3 +11100,12 @@ def test_from_records_with_index_no_shallow_copy(): data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "= version.parse("13.0.0") and from_dtype == np.dtype("float32") - and to_dtype.precision > 7, + and to_dtype.precision > 12, reason="https://github.com/rapidsai/cudf/issues/14169", ) ) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 0da5c6b04d6..794660cffcb 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import contextlib import doctest import inspect @@ -8,6 +8,7 @@ import numpy as np import pytest +from packaging import version import cudf @@ -80,6 +81,16 @@ def chdir_to_tmp_path(cls, tmp_path): yield os.chdir(original_directory) + @pytest.fixture(autouse=True) + def printoptions(cls): + # TODO: NumPy now prints scalars as `np.int8(1)`, etc.; this should + # be adapted eventually. + if version.parse(np.__version__) >= version.parse("2.0"): + with np.printoptions(legacy="1.25"): + yield + else: + yield + @pytest.mark.parametrize( "docstring", itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index edb534a3618..c62b5889fdd 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -341,7 +341,6 @@ def test_dtype(in_dtype, expect): np.complex128, complex, "S", - "a", "V", "float16", np.float16, diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index fc22d8bc0ea..28fdfb5c2f1 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -46,7 +46,8 @@ def mock_size(*args): # use_python_file_object=True, because the pyarrow # `open_input_file` command will fail (since it doesn't # use the monkey-patched `open` definition) - got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) + with pytest.warns(FutureWarning): + got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) assert_eq(pdf, got) # AbstractBufferedFile -> PythonFile conversion diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05dcd85df6a..722a64cb553 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected, actual, exact=False - if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) - or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") + or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") else True, ) @@ -3295,3 +3294,12 @@ def test_index_assignment_no_shallow_copy(index): df = cudf.DataFrame(range(1)) df.index = index assert df.index is index + + +def test_bool_rangeindex_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[pd.RangeIndex(0)]], + rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]], + ) diff --git 
a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 4a0dc331e1a..a4f0b9fc97e 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -135,3 +135,9 @@ def test_interpolate_dataframe_error_cases(data, kwargs): lfunc_args_and_kwargs=([], kwargs), rfunc_args_and_kwargs=([], kwargs), ) + + +def test_interpolate_noop_new_column(): + ser = cudf.Series([1.0, 2.0, 3.0]) + result = ser.interpolate() + assert ser._column is not result._column diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 297040b6d95..c81c2d1d94b 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1077,8 +1077,13 @@ def test_json_dtypes_nested_data(): ) pdf = pd.read_json( - StringIO(expected_json_str), orient="records", lines=True + StringIO(expected_json_str), + orient="records", + lines=True, ) + + assert_eq(df, pdf) + pdf.columns = pdf.columns.astype("str") pa_table_pdf = pa.Table.from_pandas( pdf, schema=df.to_arrow().schema, safe=False @@ -1423,3 +1428,19 @@ def test_json_reader_on_bad_lines(on_bad_lines): orient="records", on_bad_lines=on_bad_lines, ) + + +def test_chunked_json_reader(): + df = cudf.DataFrame( + { + "a": ["aaaa"] * 9_00_00_00, + "b": list(range(0, 9_00_00_00)), + } + ) + buf = BytesIO() + df.to_json(buf, lines=True, orient="records", engine="cudf") + buf.seek(0) + df = df.to_pandas() + with cudf.option_context("io.json.low_memory", True): + gdf = cudf.read_json(buf, lines=True) + assert_eq(df, gdf) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f76143cb381..36bcaa66d7d 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -12,6 +12,7 @@ from cudf import NA from cudf._lib.copying import get_element from cudf.api.types import is_scalar +from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES @@ -693,12 +694,7 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(slr.dtype) - or cudf.api.types.is_timedelta64_dtype(slr.dtype) - else cudf.NA - ) + assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA) @pytest.mark.parametrize( @@ -926,3 +922,29 @@ def test_list_iterate_error(): def test_list_struct_list_memory_usage(): df = cudf.DataFrame({"a": [[{"b": [1]}]]}) assert df.memory_usage().sum() == 16 + + +def test_empty_nested_list_uninitialized_offsets_memory_usage(): + col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) + nested_col = col.children[1] + empty_inner = type(nested_col)( + size=nested_col.size, + dtype=nested_col.dtype, + mask=nested_col.mask, + offset=nested_col.offset, + null_count=nested_col.null_count, + children=( + column_empty(0, nested_col.children[0].dtype), + nested_col.children[1], + ), + ) + col_empty_offset = type(col)( + size=col.size, + dtype=col.dtype, + mask=col.mask, + offset=col.offset, + null_count=col.null_count, + children=(column_empty(0, col.children[0].dtype), empty_inner), + ) + ser = cudf.Series._from_data({None: col_empty_offset}) + assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 07c2e9c3fcf..2c00d48266c 100644 --- 
a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -832,25 +832,17 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): # Assert ._levels identity lptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi1._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels ] rptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi2._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity - lptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi1._codes._data.items() - ] - rptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi2._codes._data.items() - ] + lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] + rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) @@ -2169,3 +2161,12 @@ def test_nunique(array, dropna): result = gidx.nunique(dropna=dropna) expected = pidx.nunique(dropna=dropna) assert result == expected + + +def test_bool_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], + rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], + ) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 588bc87d268..f2820d9c112 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): expect = cudf.read_parquet(parquet_path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath")) with fs.open_input_file(path) as fil: - got = cudf.read_parquet(fil) + with pytest.warns(FutureWarning): + got = cudf.read_parquet(fil) assert_eq(expect, got) @@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object( fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) # Pass open fsspec file - with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) @@ -1617,7 +1620,11 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - gdf.to_parquet(gdf_fname.strpath, index=None, int96_timestamps=True) + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + ) assert os.path.exists(gdf_fname) @@ -1789,10 +1796,11 @@ def test_parquet_write_bytes_io(simple_gdf): assert_eq(cudf.read_parquet(output), simple_gdf) -def test_parquet_writer_bytes_io(simple_gdf): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_bytes_io(simple_gdf, store_schema): output = BytesIO() - writer = ParquetWriter(output) + writer = ParquetWriter(output, store_schema=store_schema) writer.write_table(simple_gdf) writer.write_table(simple_gdf) writer.close() @@ -2124,7 +2132,8 @@ def 
test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) -def test_parquet_write_to_dataset(tmpdir_factory, cols): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2140,7 +2149,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): "b": np.random.choice(np.arange(4), size=size), } ) - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) # Read back with cudf @@ -2156,7 +2165,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): } ) with pytest.raises(ValueError): - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @pytest.mark.parametrize( @@ -2386,7 +2395,8 @@ def test_parquet_writer_list_large_mixed(tmpdir): assert_eq(expect, got) -def test_parquet_writer_list_chunked(tmpdir): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_list_chunked(tmpdir, store_schema): table1 = cudf.DataFrame( { "a": list_gen(string_gen, 128, 80, 50), @@ -2407,7 +2417,7 @@ def test_parquet_writer_list_chunked(tmpdir): expect = cudf.concat([table1, table2]) expect = expect.reset_index(drop=True) - writer = ParquetWriter(fname) + writer = ParquetWriter(fname, store_schema=store_schema) writer.write_table(table1) writer.write_table(table2) writer.close() @@ -2542,6 +2552,10 @@ def normalized_equals(value1, value2): value1 = None if value2 is pd.NA or value2 is pd.NaT: value2 = None + if isinstance(value1, np.datetime64): + value1 = pd.Timestamp(value1).to_pydatetime() + if isinstance(value2, np.datetime64): + value2 = pd.Timestamp(value2).to_pydatetime() if isinstance(value1, pd.Timestamp): value1 = value1.to_pydatetime() if isinstance(value2, pd.Timestamp): @@ -2550,6 +2564,9 @@ def normalized_equals(value1, value2): value1 = value1.replace(tzinfo=None) if isinstance(value2, datetime.datetime): value2 = value2.replace(tzinfo=None) + if isinstance(value1, pd.Timedelta): + unit = "ms" if value1.unit == "s" else value1.unit + value2 = pd.Timedelta(value2, unit=unit) # if one is datetime then both values are datetimes now if isinstance(value1, datetime.datetime): @@ -2563,7 +2580,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("add_nulls", [True, False]) -def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) @@ -2580,7 +2598,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): if add_nulls: for col in gdf: set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False) + gdf.to_parquet(file_path, index=False, store_schema=store_schema) # Read back from pyarrow pq_file = pq.ParquetFile(file_path) @@ -3205,7 +3223,8 @@ def test_parquet_writer_zstd(): assert_eq(expected, got) -def test_parquet_writer_time_delta_physical_type(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( { "s": cudf.Series([1], dtype="timedelta64[s]"), @@ -3217,22 +3236,35 @@ def 
test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=store_schema) got = pd.read_parquet(buffer) - expected = pd.DataFrame( - { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], - }, - dtype="str", - ) + + if store_schema: + expected = pd.DataFrame( + { + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], + }, + dtype="str", + ) + else: + expected = pd.DataFrame( + { + "s": ["00:00:01"], + "ms": ["00:00:00.002000"], + "us": ["00:00:00.000003"], + "ns": ["00:00:00.000004"], + }, + dtype="str", + ) assert_eq(got.astype("str"), expected) -def test_parquet_roundtrip_time_delta(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_roundtrip_time_delta(store_schema): num_rows = 12345 df = cudf.DataFrame( { @@ -3255,10 +3287,11 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - df.to_parquet(buffer) - # TODO: Remove `check_dtype` once following issue is fixed in arrow: - # https://github.com/apache/arrow/issues/33321 + df.to_parquet(buffer, store_schema=store_schema) + # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) + if store_schema: + assert_eq(df, pd.read_parquet(buffer)) def test_parquet_reader_malformed_file(datadir): @@ -3420,35 +3453,87 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): # Check results for reader with schema assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() -def test_parquet_reader_roundtrip_structs_with_arrow_schema(): - # Ensure that the structs with duration types are faithfully being - # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame( - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + 
"payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } } } - } - ) + ], + ], +) +def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): + # Ensure that the structs with duration types are faithfully being + # roundtripped across Parquet with arrow schema + pdf = pd.DataFrame({"struct": pd.Series(data)}) - # Reset the buffer and write parquet with arrow buffer = BytesIO() pdf.to_parquet(buffer, engine="pyarrow") @@ -3460,6 +3545,203 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results + assert_eq(expected, got) + + +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_with_arrow_schema(index): + # Ensure that the concrete and nested types are faithfully being roundtripped + # across Parquet with arrow schema + expected = cudf.DataFrame( + { + "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), + "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), + "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "bool": cudf.Series([True, None, False], dtype=bool), + "fixed32": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal32Dtype(7, 2) + ), + "fixed64": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal64Dtype(7, 2) + ), + "fixed128": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal128Dtype(7, 2) + ), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": cudf.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + # Write to Parquet with arrow schema for faithful roundtrip + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Convert decimal types to d128 + expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) + + # Read parquet with pyarrow, pandas and cudf readers + got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) + got3 = cudf.read_parquet(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + got.drop(columns="__index_level_0__", inplace=True) + got2.drop(columns="__index_level_0__", inplace=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + assert_eq(expected, got3) + + +def test_parquet_writer_int96_timestamps_and_arrow_schema(): + df = cudf.DataFrame( + { + "timestamp": cudf.Series( + [1234, 123, 4123], dtype="datetime64[ms]" + ), + } + ) + + # Output buffer + buffer = BytesIO() + + # Writing out parquet with both INT96 timestamps 
and arrow_schema + # enabled should throw an exception. + with pytest.raises(RuntimeError): + df.to_parquet(buffer, int96_timestamps=True, store_schema=True) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_structs_with_arrow_schema( + tmpdir, data, index +): + # Ensure that the structs are faithfully being roundtripped across + # Parquet with arrow schema + pa_expected = pa.Table.from_pydict({"struct": data}) + + expected = cudf.DataFrame.from_arrow(pa_expected) + + # Write expected data frame to Parquet with arrow schema + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Read Parquet with pyarrow + pa_got = pq.read_table(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + pa_got = pa_got.drop(columns="__index_level_0__") + + # Check results + assert_eq(pa_expected, pa_got) + + # Convert to cuDF table and also read Parquet with cuDF reader + got = cudf.DataFrame.from_arrow(pa_got) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + @pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) @pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) @@ -3493,6 +3775,6 @@ def test_parquet_reader_pandas_compatibility(): ) buffer = BytesIO() df.to_parquet(buffer) - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.parquet.low_memory", True): expected = cudf.read_parquet(buffer) assert_eq(expected, df) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 1247fa362ce..8be6463c699 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -248,16 +248,11 @@ def test_sum_masked(nelem): def test_sum_boolean(): s = Series(np.arange(100000)) - got = (s > 1).sum(dtype=np.int32) + got = (s > 1).sum() expect = 99998 assert expect == got - got = (s > 1).sum(dtype=np.bool_) - expect = True - - assert expect == got - def test_date_minmax(): np_data = np.random.normal(size=10**3) @@ -371,3 +366,11 
@@ def test_reduction_column_multiindex(): result = df.mean() expected = df.to_pandas().mean() assert_eq(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "product"]) +def test_dtype_deprecated(op): + ser = cudf.Series(range(5)) + with pytest.warns(FutureWarning): + result = getattr(ser, op)(dtype=np.dtype(np.int8)) + assert isinstance(result, np.int8) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 193d64a9e7f..a013745f71e 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -186,13 +186,11 @@ def test_MI(): } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] - codes = cudf.DataFrame( - { - "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], - "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], - "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - } - ) + codes = [ + [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], + [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + ] pd.options.display.max_rows = 999 pd.options.display.max_columns = 0 gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index a44bf791767..3ae318d3bf5 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): buffer = pdf.to_csv(index=False) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - use_python_file_object=False, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + use_python_file_object=False, + ) assert_eq(pdf, got) # Use Arrow PythonFile object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - use_python_file_object=True, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + use_python_file_object=True, + ) assert_eq(pdf, got) @@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_csv(fil) assert_eq(pdf, got) @@ -184,17 +187,18 @@ def test_read_csv_byte_range( # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread - if not use_python_file_object - else None, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread + if not use_python_file_object + else None, + header=None, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + 
use_python_file_object=use_python_file_object, + ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @@ -241,18 +245,19 @@ def test_read_parquet( # Check direct path handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - open_file_options=( - {"precache_options": {"method": precache}} - if use_python_file_object - else None - ), - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got1 = cudf.read_parquet( + f"s3://{bucket}/{fname}", + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -263,12 +268,13 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) assert_eq(expect, got2) @@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): pdf.to_parquet(path=buffer) buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) + with pytest.warns(FutureWarning): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf assert_eq(expect, got) @@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + with pytest.warns(FutureWarning): + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + filters=filters, + open_file_options={"precache_options": {"method": precache}}, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): buffer = f.read() with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, + ) if columns: expect = expect[columns] @@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): fs = pa_fs.S3FileSystem( 
endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 05a91a8fea3..f2faf4343b6 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pytest +from packaging import version import rmm @@ -211,9 +212,7 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype( - dtype - ) or cudf.api.types.is_timedelta64_dtype(dtype): + if s.dtype.kind in "mM": assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -253,6 +252,22 @@ def test_generic_null_scalar_construction_fails(value): cudf.Scalar(value) +@pytest.mark.parametrize( + "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")] +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_scalar_out_of_bounds_pyint_fails(value, dtype): + # Test that we align with NumPy on scalar creation behavior from + # Python integers. + if version.parse(np.__version__) >= version.parse("2.0"): + with pytest.raises(OverflowError): + cudf.Scalar(value, dtype) + else: + # NumPy allowed this, but it gives a DeprecationWarning on newer + # versions (which cudf did not used to do). + assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value) + + @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] ) @@ -352,12 +367,7 @@ def test_scalar_implicit_int_conversion(value): @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls( - pd.NaT - if cudf.api.types.is_datetime64_dtype(dtype) - or cudf.api.types.is_timedelta64_dtype(dtype) - else pd.NA - ) + cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index dbbf4fba3a6..5f5d79c1dce 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos(): @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) - slr_device = cudf.Scalar(slr, dtype=dtype) + # The scalar may be out of bounds, so go via array force-cast + # NOTE: This is a change in behavior + slr = np.array(slr).astype(dtype)[()] + slr_device = cudf.Scalar(slr) expect = op(slr_host) got = op(slr_device) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..b0788bcc0fc 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
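The scalar-related changes above (`test_scalar_out_of_bounds_pyint_fails` and the `test_unaops` force-cast) track a NumPy 2.0 behavior change. A minimal illustration, assuming NumPy >= 2.0 is installed and using nothing beyond NumPy itself:

```python
import numpy as np

# NumPy 2.0 raises for out-of-bounds Python ints converted directly to a
# fixed-width dtype (older versions wrapped, with a DeprecationWarning on
# recent 1.x releases).
try:
    np.uint8(1000)
except OverflowError:
    pass

# An explicit array cast still force-casts (wraps) the value, which is why
# test_scalar_unary_operations now goes through np.array(...).astype(...).
wrapped = np.array(1000).astype("uint8")[()]  # wraps to 232 here
```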
+from __future__ import annotations import datetime from decimal import Decimal +from typing import TYPE_CHECKING import cupy as cp import numpy as np @@ -10,8 +12,9 @@ from pandas.core.dtypes.common import infer_dtype_from_object import cudf -from cudf._typing import DtypeObj -from cudf.api.types import is_bool, is_float, is_integer + +if TYPE_CHECKING: + from cudf._typing import DtypeObj """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special @@ -91,10 +94,6 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -# The NumPy scalar types are a bit of a mess as they align with the C types -# so for now we use the `sctypes` dict (although it was made private in 2.0) -_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes - def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -116,12 +115,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic""" - dtype = np.result_type(*[a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -253,16 +246,18 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) - if dtype is not None: - if isinstance(val, str) and np.dtype(dtype).kind == "M": + dtype = np.dtype(dtype) + if isinstance(val, str) and dtype.kind == "M": # pd.Timestamp can handle str, but not np.str_ val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) else: - val = val.astype(dtype) + # At least datetimes cannot be converted to scalar via dtype.type: + val = np.array(val, dtype)[()] + else: + val = _maybe_convert_to_default_type( + cudf.api.types.pandas_dtype(type(val)) + ).type(val) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) @@ -330,32 +325,28 @@ def can_convert_to_column(obj): return is_column_like(obj) or cudf.api.types.is_list_like(obj) -def min_scalar_type(a, min_size=8): - return min_signed_type(a, min_size=min_size) - - -def min_signed_type(x, min_size=8): +def min_signed_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["int"]: + for int_dtype in (np.int8, np.int16, np.int32, np.int64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `int64` and let numpy raise appropriate exception: return np.int64(x).dtype -def min_unsigned_type(x, min_size=8): +def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["uint"]: + for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `uint64` and let numpy raise appropriate exception: return np.uint64(x).dtype @@ -373,10 +364,10 @@ def min_column_type(x, expected_type): if x.null_count == 
len(x): return x.dtype - if np.issubdtype(x.dtype, np.floating): + if x.dtype.kind == "f": return get_min_float_dtype(x) - elif np.issubdtype(expected_type, np.integer): + elif cudf.dtype(expected_type).kind in "iu": max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) @@ -422,9 +413,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype( - dtype - ) or pd.api.types.is_timedelta64_dtype(dtype): + if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -525,16 +514,14 @@ def find_common_type(dtypes): return cudf.dtype("O") # Aggregate same types - dtypes = set(dtypes) + dtypes = {cudf.dtype(dtype) for dtype in dtypes} + if len(dtypes) == 1: + return dtypes.pop() if any( isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes ): - if all( - cudf.api.types.is_decimal_dtype(dtype) - or cudf.api.types.is_numeric_dtype(dtype) - for dtype in dtypes - ): + if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes): return _find_common_type_decimal( [ dtype @@ -544,40 +531,28 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") - if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - # TODO: As list dtypes allow casting - # to identical types, improve this logic of returning a - # common dtype, for example: - # ListDtype(int64) & ListDtype(int32) common - # dtype could be ListDtype(int64). - raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " - "not supported" - ) - if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " - "not supported" - ) + elif any( + isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)) + for dtype in dtypes + ): + # TODO: As list dtypes allow casting + # to identical types, improve this logic of returning a + # common dtype, for example: + # ListDtype(int64) & ListDtype(int32) common + # dtype could be ListDtype(int64). + raise NotImplementedError( + "Finding a common type for `ListDtype` or `StructDtype` is currently " + "not supported" + ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set( - filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) - ) + dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes)) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set( - filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) - ) + td_dtypes = set(filter(lambda t: t.kind == "m", dtypes)) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) @@ -598,121 +573,22 @@ def _dtype_pandas_compatible(dtype): return dtype -def _can_cast(from_dtype, to_dtype): - """ - Utility function to determine if we can cast - from `from_dtype` to `to_dtype`. This function primarily calls - `np.can_cast` but with some special handling around - cudf specific dtypes. 
- """ - if cudf.utils.utils.is_na_like(from_dtype): - return True - if isinstance(from_dtype, type): - from_dtype = cudf.dtype(from_dtype) - if isinstance(to_dtype, type): - to_dtype = cudf.dtype(to_dtype) - - # TODO : Add precision & scale checking for - # decimal types in future - - if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - return True - elif isinstance(to_dtype, np.dtype): - if to_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype, to_dtype) - elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - if from_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): - return True - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): - # TODO: Add level based checks too once casting of - # list columns is supported - if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): - if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): - return True - elif isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype._categories.dtype, to_dtype) - else: - return False - else: - return np.can_cast(from_dtype, to_dtype) - - -def _maybe_convert_to_default_type(dtype): +def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """Convert `dtype` to default if specified by user. If not specified, return as is. """ - if cudf.get_option("default_integer_bitwidth"): - if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): - return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') - + if ib := cudf.get_option("default_integer_bitwidth"): + if dtype.kind == "i": + return cudf.dtype(f"i{ib//8}") + elif dtype.kind == "u": + return cudf.dtype(f"u{ib//8}") + if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": + return cudf.dtype(f"f{fb//8}") return dtype -def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: - if not len(rng): - return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) - - -def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: - if dtype.kind in {"i", "u"}: - if isinstance(element, range): - if _dtype_can_hold_range(element, dtype): - return True - return False - - elif is_integer(element) or ( - is_float(element) and element.is_integer() - ): - info = np.iinfo(dtype) - if info.min <= element <= info.max: - return True - return False - - elif dtype.kind == "f": - if is_integer(element) or is_float(element): - casted = dtype.type(element) - if np.isnan(casted) or casted == element: - return True - # otherwise e.g. 
overflow see TestCoercionFloat32 - return False - - elif dtype.kind == "b": - if is_bool(element): - return True - return False - - raise NotImplementedError(f"Unsupported dtype: {dtype}") - - -def _get_base_dtype(dtype: DtypeObj) -> DtypeObj: +def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: # TODO: replace the use of this function with just `dtype.base` # when Pandas 2.1.0 is the minimum version we support: # https://github.com/pandas-dev/pandas/pull/52706 diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0209c692935..80555750b3a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -6,6 +6,7 @@ import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread +from typing import Callable import fsspec import fsspec.implementations.local @@ -15,6 +16,7 @@ from pyarrow import PythonFile as ArrowPythonFile from pyarrow.lib import NativeFile +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -24,7 +26,6 @@ except ImportError: fsspec_parquet = None - _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 @@ -86,7 +87,7 @@ 1 20 rapids 2 30 ai """.format(remote_data_sources=_docstring_remote_sources) -doc_read_avro = docfmt_partial(docstring=_docstring_read_avro) +doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro) _docstring_read_parquet_metadata = """ Read a Parquet file's metadata and schema @@ -174,15 +175,23 @@ columns are also loaded. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. Setting this argument to `False` - will require the entire file to be copied to host memory, and is highly - discouraged. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. open_file_options : dict, optional Dictionary of key-value pairs to pass to the function used to open remote files. By default, this will be `fsspec.parquet.open_parquet_file`. To deactivate optimized precaching, set the "method" to `None` under the "precache_options" key. Note that the `open_file_func` key can also be used to specify a custom file-open function. + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. bytes_per_thread : int, default None Determines the number of bytes to be allocated per thread to read the files in parallel. When there is a file of large size, we get slightly @@ -322,6 +331,12 @@ output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. +store_schema : bool, default False + If ``True``, writes arrow schema to Parquet file footer's key-value + metadata section to faithfully round-trip ``duration`` types with arrow. + This cannot be used with ``int96_timestamps`` enabled as int96 timestamps + are deprecated in arrow. Also, all decimal32 and decimal64 columns will be + converted to decimal128 as arrow only supports decimal128 and decimal256 types. 
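A usage-level sketch of the new ``store_schema`` option documented above, based on the tests added in this PR (hedged illustration, not additional PR content):

```python
from io import BytesIO

import cudf

# Writing the arrow schema into the Parquet footer lets duration
# (timedelta) columns round-trip with their original types.
df = cudf.DataFrame(
    {"us": cudf.Series([None, 3456, None], dtype="timedelta64[us]")}
)
buf = BytesIO()
df.to_parquet(buf, store_schema=True)  # new keyword introduced in this PR
roundtripped = cudf.read_parquet(buf)

# Per the docstring above: store_schema=True cannot be combined with
# int96_timestamps=True, and decimal32/decimal64 columns are written
# as decimal128.
```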
**kwargs Additional parameters will be passed to execution engines other than ``cudf``. @@ -462,8 +477,12 @@ If True, use row index if available for faster seeking. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger ORC files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -928,7 +947,7 @@ -------- cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame. """ -doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf) +doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf) _docstring_to_hdf = """ Write the contained data to an HDF5 file using HDFStore. @@ -1000,7 +1019,7 @@ cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format. cudf.DataFrame.to_feather : Write out feather-format for DataFrames. """ -doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf) +doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) _docstring_read_feather = """ Load an feather object from the file path, returning a DataFrame. @@ -1182,8 +1201,12 @@ the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger CSV files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -1403,7 +1426,7 @@ result : Series """ -doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) +doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) _docstring_get_reader_filepath_or_buffer = """ @@ -1424,9 +1447,19 @@ use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers. open_file_options : dict, optional Optional dictionary of keyword arguments to pass to `_open_remote_files` (used for remote storage only). + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem. 
If False, @@ -1702,7 +1735,8 @@ def get_reader_filepath_or_buffer( mode="rb", fs=None, iotypes=(BytesIO, NativeFile), - use_python_file_object=False, + # no_default aliases to False + use_python_file_object=no_default, open_file_options=None, allow_raw_text_input=False, storage_options=None, @@ -1714,6 +1748,30 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) + if use_python_file_object is no_default: + use_python_file_object = False + elif use_python_file_object is not None: + warnings.warn( + "The 'use_python_file_object' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + else: + # Preserve the readers (e.g. read_csv) default of True + # if no use_python_file_object option is specified by the user + # for now (note: this is different from the default for this + # function of False) + # TODO: when non-pyarrow file reading perf is good enough + # we can default this to False + use_python_file_object = True + + if open_file_options is not None: + warnings.warn( + "The 'open_file_options' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + if isinstance(path_or_data, str): # Get a filesystem object if one isn't already available paths = [path_or_data] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 7347ec7866a..c9b343e0f9f 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -6,6 +6,7 @@ import os import traceback import warnings +from contextlib import contextmanager import numpy as np import pandas as pd @@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) return result_col + + +@contextmanager +def maybe_filter_deprecation( + condition: bool, message: str, category: type[Warning] +): + """Conditionally filter a warning category. 
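(A hypothetical usage sketch of this helper; the flag and warning message are illustrative, not taken from the codebase:)

    import warnings
    from cudf.utils.utils import maybe_filter_deprecation

    caller_passed_native_file = False  # illustrative condition
    with maybe_filter_deprecation(
        not caller_passed_native_file,
        message="Passing a NativeFile is deprecated",
        category=FutureWarning,
    ):
        # Suppressed when the condition is True, surfaced to the user otherwise.
        warnings.warn("Passing a NativeFile is deprecated", FutureWarning)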
+ + Parameters + ---------- + condition + If true, filter the warning + message + Message to match, passed to :func:`warnings.filterwarnings` + category + Category of warning, passed to :func:`warnings.filterwarnings` + """ + with warnings.catch_warnings(): + if condition: + warnings.filterwarnings( + "ignore", + message, + category=category, + ) + yield diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index f51ce103677..6292022d8e4 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1080,6 +1080,13 @@ def test_pickle(obj): tm.assert_equal(obj, copy) + with tempfile.TemporaryFile() as f: + xpd.to_pickle(obj, f) + f.seek(0) + copy = xpd.read_pickle(f) + + tm.assert_equal(obj, copy) + def test_dataframe_query(): cudf_pandas_df = xpd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) @@ -1566,3 +1573,62 @@ def test_arrow_string_arrays(): ) tm.assert_equal(cu_arr, pd_arr) + + +@pytest.mark.parametrize("indexer", ["at", "iat"]) +def test_at_iat(indexer): + df = xpd.DataFrame(range(3)) + result = getattr(df, indexer)[0, 0] + assert result == 0 + + getattr(df, indexer)[0, 0] = 1 + expected = pd.DataFrame([1, 1, 2]) + tm.assert_frame_equal(df, expected) + + +def test_at_setitem_empty(): + df = xpd.DataFrame({"name": []}, dtype="float64") + df.at[0, "name"] = 1.0 + df.at[0, "new"] = 2.0 + expected = pd.DataFrame({"name": [1.0], "new": [2.0]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "index", + [ + xpd.Index([1, 2, 3], name="foo"), + xpd.Index(["a", "b", "c"], name="foo"), + xpd.RangeIndex(start=0, stop=3, step=1, name="foo"), + xpd.CategoricalIndex(["a", "b", "a"], name="foo"), + xpd.DatetimeIndex( + ["2024-04-24", "2025-04-24", "2026-04-24"], name="foo" + ), + xpd.TimedeltaIndex(["1 days", "2 days", "3 days"], name="foo"), + xpd.PeriodIndex( + ["2024-06", "2023-06", "2022-06"], freq="M", name="foo" + ), + xpd.IntervalIndex.from_breaks([0, 1, 2, 3], name="foo"), + xpd.MultiIndex.from_tuples( + [(1, "a"), (2, "b"), (3, "c")], names=["foo1", "bar1"] + ), + ], +) +def test_change_index_name(index): + s = xpd.Series([1, 2, object()], index=index) + df = xpd.DataFrame({"values": [1, 2, object()]}, index=index) + + if isinstance(index, xpd.MultiIndex): + names = ["foo2", "bar2"] + s.index.names = names + df.index.names = names + + assert s.index.names == names + assert df.index.names == names + else: + name = "bar" + s.index.name = name + df.index.name = name + + assert s.index.name == name + assert df.index.name == name diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 20b731624df..dcb33b1fc1a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -121,7 +121,7 @@ skip = [ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 11e18cd4f32..badfdf06d15 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -101,7 +101,7 @@ regex = "(?P.*)" build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 
979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. + exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 28685f0c4ed..02018548b2c 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: from typing_extensions import Self + import polars as pl + __all__: list[str] = ["Column", "NamedColumn"] @@ -76,12 +78,49 @@ def sorted_like(self, like: Column, /) -> Self: See Also -------- - set_sorted + set_sorted, copy_metadata """ return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + def copy_metadata(self, from_: pl.Series, /) -> Self: + """ + Copy metadata from a host series onto self. + + Parameters + ---------- + from_ + Polars series to copy metadata from + + Returns + ------- + Self with metadata set. 
+ + See Also + -------- + set_sorted, sorted_like + """ + if len(from_) <= 1: + return self + ascending = from_.flags["SORTED_ASC"] + descending = from_.flags["SORTED_DESC"] + if ascending or descending: + has_null_first = from_.item(0) is None + has_null_last = from_.item(-1) is None + order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if (descending and has_null_first) or (ascending and has_null_last): + null_order = plc.types.NullOrder.AFTER + return self.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + return self + def set_sorted( self, *, @@ -128,24 +167,28 @@ def copy(self) -> Self: ) def mask_nans(self) -> Self: - """Return a copy of self with nans masked out.""" - if self.nan_count > 0: - raise NotImplementedError("Need to port transform.hpp to pylibcudf") + """Return a shallow copy of self with nans masked out.""" + if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count)) + if old_count == new_count: + return result.sorted_like(self) + return result return self.copy() @functools.cached_property def nan_count(self) -> int: """Return the number of NaN values in the column.""" - if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - return 0 - return plc.interop.to_arrow( - plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) - ).as_py() + if plc.traits.is_floating_point(self.obj.type()): + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + plc.types.SIZE_TYPE, + ) + ).as_py() + return 0 class NamedColumn(Column): @@ -187,3 +230,17 @@ def copy(self, *, new_name: str | None = None) -> Self: order=self.order, null_order=self.null_order, ) + + def mask_nans(self) -> Self: + """Return a shallow copy of self with nans masked out.""" + # Annoying, the inheritance is not right (can't call the + # super-type mask_nans), but will sort that by refactoring + # later. 
+ if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count), self.name) + if old_count == new_count: + return result.sorted_like(self) + return result + return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..cbeadf1426a 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,19 +5,22 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast +import pyarrow as pa + import polars as pl import cudf._lib.pylibcudf as plc from cudf_polars.containers.column import NamedColumn +from cudf_polars.utils import dtypes if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set - import pyarrow as pa from typing_extensions import Self import cudf @@ -49,8 +52,16 @@ def to_polars(self) -> pl.DataFrame: self.table, [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) - - return cast(pl.DataFrame, pl.from_arrow(table)) + return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + *( + pl.col(c.name).set_sorted( + descending=c.order == plc.types.Order.DESCENDING + ) + if c.is_sorted + else pl.col(c.name) + for c in self.columns + ) + ) @cached_property def column_names_set(self) -> frozenset[str]: @@ -82,6 +93,35 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: ] ) + @classmethod + def from_polars(cls, df: pl.DataFrame) -> Self: + """ + Create from a polars dataframe. + + Parameters + ---------- + df + Polars dataframe to convert + + Returns + ------- + New dataframe representing the input. + """ + table = df.to_arrow() + schema = table.schema + for i, field in enumerate(schema): + schema = schema.set( + i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) + ) + # No-op if the schema is unchanged. + d_table = plc.interop.from_arrow(table.cast(schema)) + return cls( + [ + NamedColumn(column, h_col.name).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns()) + ] + ) + @classmethod def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """ @@ -160,7 +200,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..a034d55120a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -32,7 +32,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence - import polars.polars as plrs + import polars as pl import polars.type_aliases as pl_types from cudf_polars.containers import DataFrame @@ -44,6 +44,7 @@ "Col", "BooleanFunction", "StringFunction", + "TemporalFunction", "Sort", "SortBy", "Gather", @@ -369,6 +370,10 @@ def do_evaluate( # datatype of pyarrow scalar is correct by construction. 
return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + class LiteralColumn(Expr): __slots__ = ("value",) @@ -376,11 +381,18 @@ class LiteralColumn(Expr): value: pa.Array[Any, Any] children: tuple[()] - def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None: + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: super().__init__(dtype) data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + def get_hash(self) -> int: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. + return hash((type(self), self.dtype, id(self.value))) + def do_evaluate( self, df: DataFrame, @@ -392,6 +404,10 @@ def do_evaluate( # datatype of pyarrow array is correct by construction. return Column(plc.interop.from_arrow(self.value)) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + class Col(Expr): __slots__ = ("name",) @@ -703,6 +719,7 @@ def _validate_input(self): pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.Slice, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -716,6 +733,11 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) def do_evaluate( self, @@ -744,6 +766,36 @@ def do_evaluate( flags=plc.strings.regex_flags.RegexFlags.DEFAULT, ) return Column(plc.strings.contains.contains_re(column.obj, prog)) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. + # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -779,6 +831,159 @@ def do_evaluate( ) # pragma: no cover; handled by init raising +class TemporalFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name != pl_expr.TemporalFunction.Year: + raise NotImplementedError(f"String function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.TemporalFunction.Year: + (column,) = columns + return Column(plc.datetime.extract_year(column.obj)) + raise NotImplementedError( + f"TemporalFunction {self.name}" + ) # pragma: no cover; init trips first + + +class UnaryFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if self.name not in ("mask_nans", "round", "setsorted", "unique"): + raise NotImplementedError(f"Unary function {name=}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + elif self.name == "setsorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! 
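                # (Reviewer note: the two get_element calls that follow copy the
                # first and last elements of the column into scalars and query
                # their validity on the host, which is how the code decides
                # whether nulls sit at the front or the back of the sorted column.)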
+ has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") @@ -1055,12 +1260,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError( "Nested aggregations in groupby" ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests if self.request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) return AggInfo([(expr, self.request, self)]) def _reduce( @@ -1182,7 +1394,8 @@ def __init__( self.children = (left, right) if ( op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) ): raise NotImplementedError("Casting rules for timelike types") diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..a84fe73810e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,9 +15,9 @@ import dataclasses import itertools -import json import types from functools import cache +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa @@ -30,7 +30,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import dtypes, sorting +from cudf_polars.utils import sorting if TYPE_CHECKING: from collections.abc import MutableMapping @@ -96,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: @@ -183,8 +185,10 @@ class Scan(IR): typ: str """What type of file are we reading? Parquet, CSV, etc...""" - options: tuple[Any, ...] 
- """Type specific options, as json-encoded strings.""" + reader_options: dict[str, Any] + """Reader-specific options, as dictionary.""" + cloud_options: dict[str, Any] | None + """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -204,9 +208,33 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): + raise NotImplementedError(f"Unhandled scan type: {self.typ}") + if self.cloud_options is not None and any( + self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") + ): raise NotImplementedError( - f"Unhandled scan type: {self.typ}" - ) # pragma: no cover; polars raises on the rust side for now + "Read from cloud storage" + ) # pragma: no cover; no test yet + if self.typ == "csv": + if self.reader_options["skip_rows_after_header"] != 0: + raise NotImplementedError("Skipping rows after header in CSV reader") + parse_options = self.reader_options["parse_options"] + if ( + null_values := parse_options["null_values"] + ) is not None and "Named" in null_values: + raise NotImplementedError( + "Per column null value specification not supported for CSV reader" + ) + if ( + comment := parse_options["comment_prefix"] + ) is not None and "Multi" in comment: + raise NotImplementedError( + "Multi-character comment prefix not supported for CSV reader" + ) + if not self.reader_options["has_header"]: + # Need to do some file introspection to get the number + # of columns so that column projection works right. + raise NotImplementedError("Reading CSV without header") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -214,14 +242,72 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - opts, cloud_opts = map(json.loads, self.options) - df = DataFrame.from_cudf( - cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + parse_options = self.reader_options["parse_options"] + sep = chr(parse_options["separator"]) + quote = chr(parse_options["quote_char"]) + eol = chr(parse_options["eol_char"]) + if self.reader_options["schema"] is not None: + # Reader schema provides names + column_names = list(self.reader_options["schema"]["inner"].keys()) + else: + # file provides column names + column_names = None + usecols = with_columns + # TODO: support has_header=False + header = 0 + + # polars defaults to no null recognition + null_values = [""] + if parse_options["null_values"] is not None: + ((typ, nulls),) = parse_options["null_values"].items() + if typ == "AllColumnsSingle": + # Single value + null_values.append(nulls) + else: + # List of values + null_values.extend(nulls) + if parse_options["comment_prefix"] is not None: + comment = chr(parse_options["comment_prefix"]["Single"]) + else: + comment = None + decimal = "," if parse_options["decimal_comma"] else "." 
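            # (Reviewer note: after json.loads, parse_options is a plain dict.
            # An illustrative payload, with byte values for the delimiter
            # characters, looks roughly like
            #   {"separator": 44, "quote_char": 34, "eol_char": 10,
            #    "null_values": {"AllColumnsSingle": "NA"},
            #    "comment_prefix": {"Single": 35}, "decimal_comma": False}
            # The exact shape is polars' serialization and may differ between
            # versions.)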
+ + # polars skips blank lines at the beginning of the file + pieces = [] + for p in self.paths: + skiprows = self.reader_options["skip_rows"] + path = Path(p) + with path.open() as f: + while f.readline() == "\n": + skiprows += 1 + tbl_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([path]), + delimiter=sep, + quotechar=quote, + lineterminator=eol, + col_names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtypes=self.schema, + ) + pieces.append(tbl_w_meta) + tables, colnames = zip( + *( + (piece.tbl, piece.column_names(include_children=False)) + for piece in pieces ) ) + df = DataFrame.from_table( + plc.concatenate.concatenate(list(tables)), + colnames[0], + ) elif self.typ == "parquet": - opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) @@ -229,7 +315,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if row_index is not None: + if ( + row_index is not None + # TODO: remove condition when dropping support for polars 1.0 + # https://github.com/pola-rs/polars/pull/17363 + and row_index[0] in self.schema + ): name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -301,17 +392,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: pdf = pdf.select(self.projection) - table = pdf.to_arrow() - schema = table.schema - for i, field in enumerate(schema): - schema = schema.set( - i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) - ) - # No-op if the schema is unchanged. - table = table.cast(schema) - df = DataFrame.from_table( - plc.interop.from_arrow(table), list(self.schema.keys()) - ) + df = DataFrame.from_polars(pdf) assert all( c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) ) @@ -431,11 +512,11 @@ def check_agg(agg: expr.Expr) -> int: NotImplementedError For unsupported expression nodes. 
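    (Reviewer sketch, not part of the docstring: the returned depth is the nesting level of aggregations, e.g.

        pl.col("a")                  -> 0   # Col
        pl.col("a").max()            -> 1   # Agg over a Col
        pl.col("a").max() + 1        -> 1   # BinOp does not add depth
        pl.col("a").round(1).sum()   -> 1   # UnaryFunction does not add depth
    )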
""" - if isinstance(agg, (expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) - elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)): return 0 else: raise NotImplementedError(f"No handler for {agg=}") @@ -458,16 +539,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) - # TODO: use sorted information, need to expose column_order - # and null_precedence in pylibcudf groupby constructor - # sorted = ( - # plc.types.Sorted.YES - # if all(k.is_sorted for k in keys) - # else plc.types.Sorted.NO - # ) + sorted = ( + plc.types.Sorted.YES + if all(k.is_sorted for k in keys) + else plc.types.Sorted.NO + ) grouper = plc.groupby.GroupBy( plc.Table([k.obj for k in keys]), null_handling=plc.types.NullPolicy.INCLUDE, + keys_are_sorted=sorted, + column_order=[k.order for k in keys], + null_precedence=[k.null_order for k in keys], ) # TODO: uniquify requests = [] @@ -494,7 +576,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results]).slice(self.options.slice) + return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) @dataclasses.dataclass @@ -573,6 +655,59 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> list[plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. + + Parameters + ---------- + left_rows + Number of rows in left table + lg + Left gather map + left_policy + Nullify policy for left map + right_rows + Number of rows in right table + rg + Right gather map + right_policy + Nullify policy for right map + + Returns + ------- + list of reordered left and right gather maps. + + Notes + ----- + For a left join, the polars result preserves the order of the + left keys, and is stable wrt the right keys. For all other + joins, there is no order obligation. 
+ """ + dt = plc.interop.to_arrow(plc.types.SIZE_TYPE) + init = plc.interop.from_arrow(pa.scalar(0, type=dt)) + step = plc.interop.from_arrow(pa.scalar(1, type=dt)) + left_order = plc.copying.gather( + plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy + ) + right_order = plc.copying.gather( + plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy + ) + return plc.sorting.stable_sort_by_key( + plc.Table([lg, rg]), + plc.Table([*left_order.columns(), *right_order.columns()]), + [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ).columns() + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -613,6 +748,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) + if how == "left": + # Order of left table is preserved + lg, rg = self._reorder_maps( + left.num_rows, lg, left_policy, right.num_rows, rg, right_policy + ) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a2fdb3c3d79..dec45679c75 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,6 +5,7 @@ from __future__ import annotations +import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch from typing import Any @@ -12,6 +13,7 @@ import pyarrow as pa from typing_extensions import assert_never +import polars as pl import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir @@ -88,10 +90,16 @@ def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type + if typ == "ndjson": + (reader_options,) = map(json.loads, options) + cloud_options = None + else: + reader_options, cloud_options = map(json.loads, options) return ir.Scan( schema, typ, - tuple(options), + reader_options, + cloud_options, node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) @@ -361,8 +369,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(name, pl_expr.TemporalFunction): + return expr.TemporalFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, str): + return expr.UnaryFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + raise NotImplementedError( + f"No handler for Expr function node with {name=}" + ) # pragma: no cover; polars raises on the rust side for now @_translate_expr.register @@ -387,7 +410,7 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: if isinstance(node.value, plrs.PySeries): - return expr.LiteralColumn(dtype, node.value) + return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) value = 
pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) @@ -432,8 +455,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) - else: - return expr.Cast(dtype, inner) + elif isinstance(inner, expr.Cast): + # Translation of Len/Count-agg put in a cast, remove double + # casts if we have one. + (inner,) = inner.children + return expr.Cast(dtype, inner) @_translate_expr.register @@ -443,12 +469,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Agg( + value = expr.Agg( dtype, node.name, node.options, *(translate_expr(visitor, n=n) for n in node.arguments), ) + if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: + return expr.Cast(value.dtype, value) + return value @_translate_expr.register @@ -475,7 +504,10 @@ def _( @_translate_expr.register def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Len(dtype) + value = expr.Len(dtype) + if dtype.id() != plc.TypeId.INT32: + return expr.Cast(dtype, value) + return value # pragma: no cover; never reached since polars len has uint32 dtype def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -17,19 +17,6 @@ __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] -TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( - [ - plc.TypeId.TIMESTAMP_MILLISECONDS, - plc.TypeId.TIMESTAMP_MICROSECONDS, - plc.TypeId.TIMESTAMP_NANOSECONDS, - plc.TypeId.TIMESTAMP_DAYS, - plc.TypeId.DURATION_MILLISECONDS, - plc.TypeId.DURATION_MICROSECONDS, - plc.TypeId.DURATION_NANOSECONDS, - ] -) - - def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): """ Do two datetime typeids have matching resolution for a binop. diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py new file mode 100644 index 00000000000..9807cffb384 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Version utilities so that cudf_polars supports a range of polars versions.""" + +# ruff: noqa: SIM300 +from __future__ import annotations + +from packaging.version import parse + +from polars import __version__ + +POLARS_VERSION = parse(__version__) + +POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") +POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") +POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") +POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") +POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") +POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") + +POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") +POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") +POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") +POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") + +if POLARS_VERSION < parse("1.0"): # pragma: no cover + raise ImportError("cudf_polars requires py-polars v1.0 or greater.") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index bf4673fcc50..0b559f7a8e9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -182,5 +182,3 @@ docstring-code-format = true [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" -# Pure python -disable-cuda = true diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 3291d8db161..4f3c0de5975 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,12 +3,14 @@ from __future__ import annotations +from functools import partial + import pyarrow import pytest import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column +from cudf_polars.containers import Column, NamedColumn def test_non_scalar_access_raises(): @@ -54,17 +56,21 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -def test_mask_nans(typeid): +@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) +def test_mask_nans(typeid, constructor): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = Column(plc.interop.from_arrow(values)) + column = constructor(plc.interop.from_arrow(values)) masked = column.mask_nans() - assert column.obj is masked.obj + assert column.obj.null_count() == masked.obj.null_count() -def test_mask_nans_float_with_nan_notimplemented(): +def test_mask_nans_float(): dtype = plc.DataType(plc.TypeId.FLOAT32) values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype)) column = Column(plc.interop.from_arrow(values)) - with pytest.raises(NotImplementedError): - _ = column.mask_nans() + masked = column.mask_nans() + expect = pyarrow.array([0, 0, None], type=plc.interop.to_arrow(dtype)) + got = pyarrow.array(plc.interop.to_arrow(masked.obj)) + + assert expect == got diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 2e385e39eef..87508e17407 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -5,6 +5,8 @@ import pytest +import polars as pl + import cudf._lib.pylibcudf as plc from cudf_polars.containers import DataFrame, NamedColumn @@ -90,3 +92,52 @@ def test_shallow_copy(): 
) assert df.columns[0].is_sorted == plc.types.Sorted.YES assert copy.columns[0].is_sorted == plc.types.Sorted.NO + + +def test_sorted_flags_preserved_empty(): + df = pl.DataFrame({"a": pl.Series([], dtype=pl.Int8())}) + df.select(pl.col("a").sort()) + + gf = DataFrame.from_polars(df) + + (a,) = gf.columns + + assert a.is_sorted == plc.types.Sorted.YES + + assert df.flags == gf.to_polars().flags + + +@pytest.mark.parametrize("nulls_last", [True, False]) +def test_sorted_flags_preserved(with_nulls, nulls_last): + values = [1, 2, -1, 2, 4, 5] + if with_nulls: + values[4] = None + df = pl.DataFrame({"a": values, "b": values, "c": values}) + + df = df.select( + pl.col("a").sort(descending=False, nulls_last=nulls_last), + pl.col("b").sort(descending=True, nulls_last=nulls_last), + pl.col("c"), + ) + + gf = DataFrame.from_polars(df) + + a_null_order = ( + plc.types.NullOrder.AFTER + if nulls_last and with_nulls + else plc.types.NullOrder.BEFORE + ) + b_null_order = ( + plc.types.NullOrder.AFTER + if not nulls_last and with_nulls + else plc.types.NullOrder.BEFORE + ) + a, b, c = gf.columns + assert a.is_sorted == plc.types.Sorted.YES + assert a.order == plc.types.Order.ASCENDING + assert a.null_order == a_null_order + assert b.is_sorted == plc.types.Sorted.YES + assert b.order == plc.types.Order.DESCENDING + assert b.null_order == b_null_order + assert c.is_sorted == plc.types.Sorted.NO + assert df.flags == gf.to_polars().flags diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 267d0a99692..245bde3acab 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -20,13 +20,7 @@ def dtype(request): return request.param -@pytest.fixture( - params=[ - False, - pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")), - ], - ids=["unsorted", "sorted"], -) +@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"]) def is_sorted(request): return request.param @@ -59,14 +53,25 @@ def test_agg(df, agg): @pytest.mark.parametrize( - "propagate_nans", - [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True], - ids=["mask_nans", "propagate_nans"], + "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) -@pytest.mark.parametrize("op", ["min", "max"]) -def test_agg_float_with_nans(propagate_nans, op): - df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())}) - op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) +def test_agg_float_with_nans(op): + df = pl.LazyFrame( + { + "a": pl.Series([1, 2, float("nan")], dtype=pl.Float64()), + "b": pl.Series([1, 2, None], dtype=pl.Int8()), + } + ) + q = df.select(op(pl.col("a")), op(pl.col("b"))) + + assert_gpu_result_equal(q) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +@pytest.mark.parametrize("op", [pl.Expr.max, pl.Expr.min]) +def test_agg_singleton(op): + df = pl.LazyFrame({"a": pl.Series([float("nan")])}) + q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 6ba2a1dce1e..218101bf87c 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime +from operator import 
methodcaller + import pytest import polars as pl @@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype): query = ldf.select(pl.col("b"), pl.col("a")) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "field", + [ + methodcaller("year"), + pytest.param( + methodcaller("day"), + marks=pytest.mark.xfail(reason="day extraction not implemented"), + ), + ], +) +def test_datetime_extract(field): + ldf = pl.LazyFrame( + {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} + ) + q = ldf.select(field(pl.col("dates").dt)) + + with pytest.raises(AssertionError): + # polars produces int32, libcudf produces int16 for the year extraction + # libcudf can lose data here. + # https://github.com/rapidsai/cudf/issues/16196 + assert_gpu_result_equal(q) + + assert_gpu_result_equal(q, check_dtypes=False) diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py new file mode 100644 index 00000000000..3af3a0ce6d1 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def dtype(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls): + a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8] + if with_nulls: + a[2] = None + a[-1] = None + return pl.LazyFrame({"a": a}, schema={"a": dtype}) + + +@pytest.mark.parametrize("decimals", [0, 2, 4]) +def test_round(df, decimals): + q = df.select(pl.col("a").round(decimals=decimals)) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 0195266f5c6..d46df92db94 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -8,6 +8,9 @@ import polars as pl +import cudf._lib.pylibcudf as plc + +from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -51,3 +54,31 @@ def test_sort_by_expression(descending, nulls_last, maintain_order): ) ) assert_gpu_result_equal(query, check_row_order=maintain_order) + + +@pytest.mark.parametrize("descending", [False, True]) +@pytest.mark.parametrize("nulls_last", [False, True]) +def test_setsorted(descending, nulls_last, with_nulls): + values = sorted([1, 2, 3, 4, 5, 6, -2], reverse=descending) + if with_nulls: + values[-1 if nulls_last else 0] = None + df = pl.LazyFrame({"a": values}) + + q = df.set_sorted("a", descending=descending) + + assert_gpu_result_equal(q) + + df = translate_ir(q._ldf.visit()).evaluate(cache={}) + + (a,) = df.columns + + assert a.is_sorted == plc.types.Sorted.YES + null_order = ( + plc.types.NullOrder.AFTER + if (descending ^ nulls_last) and with_nulls + else plc.types.NullOrder.BEFORE + ) + assert a.null_order == null_order + assert a.order == ( + plc.types.Order.DESCENDING if descending else plc.types.Order.ASCENDING + ) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 9729e765948..8cf65dd51ac 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py 
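(Before the slice tests below, a small sketch of the host-side index math that StringFunction.do_evaluate applies for ``Slice``; it mirrors the arithmetic added in expr.py above and is not part of the patch:)

    def polars_slice_to_libcudf(start: int, length: int | None) -> tuple[int, int | None]:
        # polars takes an offset plus optional length; libcudf slices [start, stop)
        # where stop=None means "to the end of the string".
        if length == 0:
            return start, start                       # empty slice
        stop = start + length if length else None     # length None => scan to end
        if length and start < 0 and length >= -start:
            stop = None                               # counting back past the end
        return start, stop

    # e.g. .str.slice(1, 3)  -> slice_strings(start=1,  stop=4)
    #      .str.slice(-3, 4) -> slice_strings(start=-3, stop=None)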
@@ -37,6 +37,30 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) +slice_cases = [ + (1, 3), + (0, 3), + (0, 0), + (-3, 1), + (-100, 5), + (1, 1), + (100, 100), + (-3, 4), + (-3, 3), +] + + +@pytest.fixture(params=slice_cases) +def slice_column_data(ldf, request): + start, length = request.param + if length: + return ldf.with_columns( + pl.lit(start).alias("start"), pl.lit(length).alias("length") + ) + else: + return ldf.with_columns(pl.lit(start).alias("start")) + + def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), @@ -104,3 +128,25 @@ def test_contains_invalid(ldf): query.collect() with pytest.raises(pl.exceptions.ComputeError): query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) + + +@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100]) +def test_slice_scalars_offset(ldf, offset): + query = ldf.select(pl.col("a").str.slice(offset)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("offset,length", slice_cases) +def test_slice_scalars_length_and_offset(ldf, offset, length): + query = ldf.select(pl.col("a").str.slice(offset, length)) + assert_gpu_result_equal(query) + + +def test_slice_column(slice_column_data): + if "length" in slice_column_data.collect_schema(): + query = slice_column_data.select( + pl.col("a").str.slice(pl.col("start"), pl.col("length")) + ) + else: + query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) + assert_ir_translation_raises(query, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py new file mode 100644 index 00000000000..9b009a422c2 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_unique.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_unique(maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + } + ).lazy() + if pre_sorted: + ldf = ldf.sort("b") + + query = ldf.select(pl.col("b").unique(maintain_order=maintain_order)) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index aefad59eb91..a75825ef3d3 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import itertools + import pytest import polars as pl @@ -10,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture @@ -26,12 +29,12 @@ def df(): @pytest.fixture( params=[ - ["key1"], - ["key2"], + [pl.col("key1")], + [pl.col("key2")], [pl.col("key1") * pl.col("key2")], - ["key1", "key2"], + [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], - ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)], + [pl.col("key2"), pl.col("key1") == pl.lit(1, dtype=pl.Int64)], ], ids=lambda keys: "-".join(map(str, keys)), ) @@ -47,6 +50,8 @@ def keys(request): [pl.col("float").max() - pl.col("int").min()], [pl.col("float").mean(), pl.col("int").std()], [(pl.col("float") - pl.lit(2)).max()], + [pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -80,13 +85,39 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): assert_gpu_result_equal(q, check_exact=False) +def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): + sorted_keys = [ + key.sort(descending=descending) + for key, descending in zip(keys, itertools.cycle([False, True])) + ] + + q = df.group_by(*sorted_keys).agg(*exprs) + + schema = q.collect_schema() + sort_keys = list(schema.keys())[: len(keys)] + # Multiple keys don't do sorting + qsorted = q.sort(*sort_keys) + if len(keys) > 1: + with pytest.raises(AssertionError): + # https://github.com/pola-rs/polars/issues/17556 + assert_gpu_result_equal(q, check_exact=False) + if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): + # https://github.com/pola-rs/polars/issues/17557 + with pytest.raises(AssertionError): + assert_gpu_result_equal(qsorted, check_exact=False) + else: + assert_gpu_result_equal(qsorted, check_exact=False) + elif schema[sort_keys[0]] == pl.Boolean(): + # Boolean keys don't do sorting, so we get random order + assert_gpu_result_equal(qsorted, check_exact=False) + else: + assert_gpu_result_equal(q, check_exact=False) + + def test_groupby_len(df, keys): q = df.group_by(*keys).agg(pl.len()) - # TODO: polars returns UInt32, libcudf returns Int32 - with pytest.raises(AssertionError): - assert_gpu_result_equal(q, check_row_order=False) - assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False) + assert_gpu_result_equal(q, check_row_order=False) @pytest.mark.parametrize( @@ -100,3 +131,55 @@ def test_groupby_unsupported(df, expr): q = df.group_by("key1").agg(expr) 
assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +def test_groupby_minmax_with_nan(): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg( + pl.col("value").max().alias("max"), pl.col("value").min().alias("min") + ) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("op", [pl.Expr.nan_max, pl.Expr.nan_min]) +def test_groupby_nan_minmax_raises(op): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg(op(pl.col("value"))) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "key", + [ + pytest.param( + 1, + marks=pytest.mark.xfail( + versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" + ), + ), + pl.col("key1"), + ], +) +@pytest.mark.parametrize( + "expr", + [ + pl.lit(1).alias("value"), + pl.lit([[4, 5, 6]]).alias("value"), + pl.col("float") * (1 - pl.col("int")), + [pl.lit(2).alias("value"), pl.col("float") * 2], + ], +) +def test_groupby_literal_in_agg(df, key, expr): + # check_row_order=False doesn't work for list aggregations + # so just sort by the group key + q = df.group_by(key).agg(expr).sort(key, maintain_order=True) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 89f6fd3455b..1ffbf3c0ef4 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -53,7 +53,7 @@ def test_join(how, coalesce, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") def test_cross_join(): diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index f129cc7ca32..0981a96a34a 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os + import pytest import polars as pl @@ -10,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture( @@ -22,22 +25,22 @@ def row_index(request): @pytest.fixture( params=[ - (None, 0), + None, pytest.param( - (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan") + 2, marks=pytest.mark.xfail(reason="No handling of row limit in scan") ), pytest.param( - (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan") + 3, marks=pytest.mark.xfail(reason="No handling of row limit in scan") ), ], ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"], ) -def n_rows_skip_rows(request): +def n_rows(request): return request.param @pytest.fixture(params=["csv", "parquet"]) -def df(request, tmp_path, row_index, n_rows_skip_rows): +def df(request, tmp_path, row_index, n_rows): df = pl.DataFrame( { "a": [1, 2, 3, None], @@ -46,14 +49,12 @@ def df(request, tmp_path, row_index, n_rows_skip_rows): } ) name, offset = row_index - n_rows, skip_rows = n_rows_skip_rows if request.param == "csv": df.write_csv(tmp_path / "file.csv") return pl.scan_csv( tmp_path / "file.csv", row_index_name=name, row_index_offset=offset, - skip_rows_after_header=skip_rows, n_rows=n_rows, ) else: @@ -97,3 +98,138 @@ def test_scan_unsupported_raises(tmp_path): 
     df.write_ndjson(tmp_path / "df.json")
     q = pl.scan_ndjson(tmp_path / "df.json")
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.xfail(
+    versions.POLARS_VERSION_LT_11,
+    reason="https://github.com/pola-rs/polars/issues/15730",
+)
+def test_scan_row_index_projected_out(tmp_path):
+    df = pl.DataFrame({"a": [1, 2, 3]})
+
+    df.write_parquet(tmp_path / "df.pq")
+
+    q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a"))
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_column_renames_projection_schema(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+
+    q = pl.scan_csv(
+        tmp_path / "test.csv",
+        with_column_names=lambda names: [f"{n}_suffix" for n in names],
+        schema_overrides={
+            "foo_suffix": pl.String(),
+            "bar_suffix": pl.Int8(),
+            "baz_suffix": pl.UInt16(),
+        },
+    )
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "filename,glob",
+    [
+        (["test1.csv", "test2.csv"], True),
+        ("test*.csv", True),
+        # Make sure we don't expand glob when
+        # trying to read a file like test*.csv
+        # when glob=False
+        ("test*.csv", False),
+    ],
+)
+def test_scan_csv_multi(tmp_path, filename, glob):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test*.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    os.chdir(tmp_path)
+    q = pl.scan_csv(filename, glob=glob)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_multi_differing_colnames(tmp_path):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""abc,def,ghi\n1,2\n3,4,5""")
+    q = pl.scan_csv(
+        [tmp_path / "test1.csv", tmp_path / "test2.csv"],
+    )
+    with pytest.raises(pl.exceptions.ComputeError):
+        q.explain()
+
+
+def test_scan_csv_skip_after_header_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", skip_rows_after_header=1)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_null_values_per_column_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", null_values={"foo": "1", "baz": "5"})
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_comment_str_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n// 1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="// ")
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_comment_char(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n# 1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="#")
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("nulls", [None, "3", ["3", "5"]])
+def test_scan_csv_null_values(tmp_path, nulls):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5\n5,,2""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", null_values=nulls)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_decimal_comma(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo|bar|baz\n1,23|2,34|3,56\n1""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", decimal_comma=True)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_skip_initial_empty_rows(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""\n\n\n\nfoo|bar|baz\n1|2|3\n1""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1, has_header=False)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1)
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index b021d832910..865b95a7d91 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -46,3 +46,12 @@ def test_concat_vertical():
     q = pl.concat([ldf, ldf2], how="vertical")
 
     assert_gpu_result_equal(q)
+
+
+def test_concat_diagonal_empty():
+    df1 = pl.LazyFrame()
+    df2 = pl.LazyFrame({"a": [1, 2]})
+
+    q = pl.concat([df1, df2], how="diagonal_relaxed")
+
+    assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 1f55a59ea55..4bdb5d921ec 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -667,17 +667,10 @@ def from_dict(
         )
 
     @staticmethod
-    def read_json(*args, engine="auto", **kwargs):
-        return _default_backend(
-            dd.read_json,
-            *args,
-            engine=(
-                partial(cudf.read_json, engine=engine)
-                if isinstance(engine, str)
-                else engine
-            ),
-            **kwargs,
-        )
+    def read_json(*args, **kwargs):
+        from dask_cudf.io.json import read_json as read_json_impl
+
+        return read_json_impl(*args, **kwargs)
 
     @staticmethod
     def read_orc(*args, **kwargs):
diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py
index 2a6ad603414..8705d98e9d6 100644
--- a/python/dask_cudf/dask_cudf/io/json.py
+++ b/python/dask_cudf/dask_cudf/io/json.py
@@ -1,15 +1,71 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from functools import partial
 
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
 import dask
+from dask.utils import parse_bytes
 
 import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
 
 from dask_cudf.backends import _default_backend
 
 
-def read_json(url_path, engine="auto", **kwargs):
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # Transfer all data up front for remote storage
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add "path" column.
+        # Must iterate over sources sequentially
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # Pass sources directly to cudf
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
     """Read JSON data into a :class:`.DataFrame`.
 
     This function wraps :func:`dask.dataframe.read_json`, and passes
@@ -30,7 +86,13 @@ def read_json(url_path, engine="auto", **kwargs):
         data. The default value is "auto", so that
         ``engine=partial(cudf.read_json, engine="auto")`` will be
         passed to :func:`dask.dataframe.read_json` by default.
-
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
     **kwargs :
         Key-word arguments to pass through to :func:`dask.dataframe.read_json`.
 
@@ -60,9 +122,77 @@
     """
-    # TODO: Add optimized code path to leverage the
-    # `byte_range` argument in `cudf.read_json` for
-    # local storage (see `dask_cudf.read_csv`)
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines "
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if aggregate_files and blocksize or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all([file_size <= blocksize for file_size in file_sizes]):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
     return _default_backend(
         dask.dataframe.read_json,
         url_path,
@@ -71,5 +201,9 @@ def read_json(url_path, engine="auto", **kwargs):
             if isinstance(engine, str)
             else engine
         ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
         **kwargs,
     )
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index dc780478794..abafbffd197 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
+import math
 import os
 
 import pandas as pd
@@ -97,3 +98,31 @@ def test_read_json_nested(tmp_path):
     # Ensure not passing kwargs also reads the file.
     actual = dask_cudf.read_json(f)
     dd.assert_eq(actual, actual_pd)
+
+
+def test_read_json_aggregate_files(tmp_path):
+    df1 = dask.datasets.timeseries(
+        dtypes={"x": int, "y": int}, freq="120s"
+    ).reset_index(drop=True)
+    json_path = str(tmp_path / "data-*.json")
+    df1.to_json(json_path)
+
+    df2 = dask_cudf.read_json(json_path, aggregate_files=2)
+    assert df2.npartitions == math.ceil(df1.npartitions / 2)
+    dd.assert_eq(df1, df2, check_index=False)
+
+    df2 = dask_cudf.read_json(
+        json_path, aggregate_files=True, blocksize="1GiB"
+    )
+    assert df2.npartitions == 1
+    dd.assert_eq(df1, df2, check_index=False)
+
+    for include_path_column, name in [(True, "path"), ("file", "file")]:
+        df2 = dask_cudf.read_json(
+            json_path,
+            aggregate_files=2,
+            include_path_column=include_path_column,
+        )
+        assert name in df2.columns
+        assert len(df2[name].compute().unique()) == df1.npartitions
+        dd.assert_eq(df1, df2.drop(columns=[name]), check_index=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..3947c69aaa5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options):
         storage_options=s3so,
         open_file_options=open_file_options,
     )
-    assert df.a.sum().compute() == 10
-    assert df.b.sum().compute() == 9
+    with pytest.warns(FutureWarning):
+        assert df.a.sum().compute() == 10
+    with pytest.warns(FutureWarning):
+        assert df.b.sum().compute() == 9
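Usage note (illustrative only, not part of the patch): the `aggregate_files` and `blocksize` options that this diff adds to `dask_cudf.read_json` can be exercised roughly as sketched below. The file pattern, blocksize value, and partition counts are hypothetical; only the parameter names and behavior come from the new code above.

    import dask_cudf

    # Map a fixed number of JSON-lines files into each output partition.
    ddf = dask_cudf.read_json("data-*.json", aggregate_files=2)

    # Or let blocksize decide how many files fit per partition; if any single
    # file is larger than the parsed blocksize, aggregation is skipped and the
    # call falls back to dask.dataframe.read_json with a cudf-backed engine.
    ddf = dask_cudf.read_json("data-*.json", aggregate_files=True, blocksize="256MiB")

    # include_path_column adds a column recording each row's source file; it is
    # applied per file by the new _read_json_partition helper.
    ddf = dask_cudf.read_json(
        "data-*.json", aggregate_files=2, include_path_column=True
    )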