diff --git a/build.sh b/build.sh index adf6e220744..f5a59b6edcf 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # cuDF build script @@ -17,7 +17,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h" +VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) @@ -37,6 +37,8 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk --disable_nvtx - disable inserting NVTX profiling ranges --show_depr_warn - show cmake deprecation warnings --ptds - enable per-thread default stream + --build_metrics - generate build metrics report for libcudf + --incl_cache_stats - include cache statistics in build metrics report --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) -h | --h[elp] - print this text @@ -61,6 +63,8 @@ BUILD_NVTX=ON BUILD_TESTS=OFF BUILD_DISABLE_DEPRECATION_WARNING=ON BUILD_PER_THREAD_DEFAULT_STREAM=OFF +BUILD_REPORT_METRICS=OFF +BUILD_REPORT_INCL_CACHE_STATS=OFF # Set defaults for vars that may not have been defined externally # FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check @@ -144,6 +148,14 @@ fi if hasArg --ptds; then BUILD_PER_THREAD_DEFAULT_STREAM=ON fi +if hasArg --build_metrics; then + BUILD_REPORT_METRICS=ON +fi + +if hasArg --incl_cache_stats; then + BUILD_REPORT_INCL_CACHE_STATS=ON +fi + # If clean given, run it prior to any other steps if hasArg clean; then @@ -174,8 +186,11 @@ if buildAll || hasArg libcudf; then # get the current count before the compile starts FILES_IN_CCACHE="" - if [ -x "$(command -v ccache)" ]; then + if [[ "$BUILD_REPORT_INCL_CACHE_STATS"=="ON" && -x "$(command -v ccache)" ]]; then FILES_IN_CCACHE=$(ccache -s | grep "files in cache") + echo "$FILES_IN_CCACHE" + # zero the ccache statistics + ccache -z fi cmake -S $REPODIR/cpp -B ${LIB_BUILD_DIR} \ @@ -197,12 +212,24 @@ if buildAll || hasArg libcudf; then compile_total=$(( compile_end - compile_start )) # Record build times - if [[ -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then - echo "Formatting build times" + if [[ "$BUILD_REPORT_METRICS"=="ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then + echo "Formatting build metrics" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml - message="$FILES_IN_CCACHE

$PARALLEL_LEVEL parallel build time is $compile_total seconds" - echo "$message" - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$message" > ${LIB_BUILD_DIR}/ninja_log.html + MSG="

" + # get some ccache stats after the compile + if [[ "$BUILD_REPORT_INCL_CACHE_STATS"=="ON" && -x "$(command -v ccache)" ]]; then + MSG="${MSG}
$FILES_IN_CCACHE" + HIT_RATE=$(ccache -s | grep "cache hit rate") + MSG="${MSG}
${HIT_RATE}" + fi + MSG="${MSG}
parallel setting: $PARALLEL_LEVEL" + MSG="${MSG}
parallel build time: $compile_total seconds" + if [[ -f "${LIB_BUILD_DIR}/libcudf.so" ]]; then + LIBCUDF_FS=$(ls -lh ${LIB_BUILD_DIR}/libcudf.so | awk '{print $5}') + MSG="${MSG}
libcudf.so size: $LIBCUDF_FS" + fi + echo "$MSG" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 979db1b5034..534ac19ee98 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +export DASK_DISTRIBUTED_GIT_TAG='main' function remove_libcudf_kernel_cache_dir { EXITCODE=$? @@ -98,7 +98,7 @@ conda list --show-channel-urls ################################################################################ logger "Build libcudf..." -if [[ ${BUILD_MODE} == "pull-request" ]]; then +if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds else "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 00dffa57683..f23296038f2 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -78,6 +78,14 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then mkdir -p ${CONDA_BLD_DIR}/libcudf/work cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work + # Copy libcudf build metrics results + LIBCUDF_BUILD_DIR=$CONDA_BLD_DIR/libcudf/work/cpp/build + echo "Checking for build metrics log $LIBCUDF_BUILD_DIR/ninja_log.html" + if [[ -f "$LIBCUDF_BUILD_DIR/ninja_log.html" ]]; then + gpuci_logger "Copying build metrics results" + mkdir -p "$WORKSPACE/build-metrics" + cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + fi gpuci_logger "Build conda pkg for libcudf_kafka" gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf_kafka $CONDA_BUILD_ARGS diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 40e80def8ae..e6ef72d930c 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -12,7 +12,7 @@ export GPUCI_RETRY_SLEEP=30 export LABEL_OPTION=${LABEL_OPTION:-"--label main"} # Skip uploads unless BUILD_MODE == "branch" -if [ ${BUILD_MODE} != "branch" ]; then +if [ "${BUILD_MODE}" != "branch" ]; then echo "Skipping upload" return 0 fi diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a557a2ef066..39a39c46eff 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +export DASK_DISTRIBUTED_GIT_TAG='main' # ucx-py version export UCX_PY_VERSION='0.24.*' @@ -89,7 +89,7 @@ gpuci_mamba_retry install -y \ "ucx-py=${UCX_PY_VERSION}" # https://docs.rapids.ai/maintainers/depmgmt/ -# gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_mamba_retry install -y "your-pkg=1.0.0" @@ -124,7 +124,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ ${BUILD_MODE} == "pull-request" ]]; then + if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds else "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds @@ -181,12 +181,10 @@ else done # Copy libcudf build time results - echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.html" - if [[ -f "$LIB_BUILD_DIR/ninja_log.html" ]]; then + echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.xml" + if [[ -f "$LIB_BUILD_DIR/ninja_log.xml" ]]; then gpuci_logger "Copying build time results" cp "$LIB_BUILD_DIR/ninja_log.xml" "$WORKSPACE/test-results/buildtimes-junit.xml" - mkdir -p "$WORKSPACE/build-metrics" - cp "$LIB_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" fi ################################################################################ @@ -224,7 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ ${BUILD_MODE} == "pull-request" ]]; then + if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds else "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index 6f7038d21d7..b048470d155 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -77,7 +77,7 @@ conda config --show-sources conda list --show-channel-urls gpuci_logger "Install dependencies" -gpuci_conda_retry install -y \ +gpuci_mamba_retry install -y \ "cudatoolkit=$CUDA_REL" \ "rapids-build-env=$MINOR_VERSION.*" \ "rapids-notebook-env=$MINOR_VERSION.*" \ @@ -86,10 +86,14 @@ gpuci_conda_retry install -y \ "ucx-py=${UCX_PY_VERSION}" \ "openjdk=8.*" \ "maven" +# "mamba install openjdk" adds an activation script to set JAVA_HOME but this is +# not triggered on installation. Re-activating the conda environment will set +# this environment variable so that CMake can find JNI. +conda activate rapids # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env -# gpuci_conda_retry install -y "your-pkg=1.0.0" +# gpuci_mamba_retry install -y "your-pkg=1.0.0" gpuci_logger "Check compiler versions" @@ -130,7 +134,7 @@ KAFKA_CONDA_FILE=`basename "$KAFKA_CONDA_FILE" .tar.bz2` #get filename without e KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE" -conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE" +gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE" install_dask diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index cc8d50a1717..c258a5caabb 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index 3db559c144d..5d8720f1c98 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index e450d306cbe..571d1bdea8f 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set cuda_version = '.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set py_version = environ.get('PY_VER', '3.8') %} +{% set py_version_numeric = py_version.replace('.', '') %} package: name: cudf_kafka @@ -14,7 +15,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version_numeric }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - CC - CXX @@ -26,14 +27,15 @@ requirements: build: - cmake >=3.20.1 host: - - python + - python {{ py_version }} - cython >=0.29,<0.30 - - setuptools - cudf {{ version }} - libcudf_kafka {{ version }} + - setuptools run: + - python {{ py_version }} - libcudf_kafka {{ version }} - - python-confluent-kafka + - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}* - cudf {{ version }} test: # [linux64] diff --git a/conda/recipes/custreamz/build.sh b/conda/recipes/custreamz/build.sh index 6ce9e4f21a9..88fccf90c69 100644 --- a/conda/recipes/custreamz/build.sh +++ b/conda/recipes/custreamz/build.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory ./build.sh -v custreamz diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index a8b096d4892..7d9529257e6 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,9 +1,10 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set cuda_version = '.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set py_version = environ.get('PY_VER', '3.8') %} +{% set py_version_numeric = py_version.replace('.', '') %} package: name: custreamz @@ -14,7 +15,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version_numeric }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - VERSION_SUFFIX - PARALLEL_LEVEL @@ -24,16 +25,16 @@ build: requirements: host: - - python - - python-confluent-kafka + - python {{ py_version }} + - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}* - cudf_kafka {{ version }} run: - - python + - python {{ py_version }} - streamz - cudf {{ version }} - dask>=2021.11.1,<=2021.11.2 - distributed>=2021.11.1,<=2021.11.2 - - python-confluent-kafka + - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}* - cudf_kafka {{ version }} test: # [linux64] diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index da8bcea430a..fd34ff4112d 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -27,14 +27,14 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh index 703f8dc15c7..c3730b3241a 100644 --- a/conda/recipes/libcudf/build.sh +++ b/conda/recipes/libcudf/build.sh @@ -4,5 +4,5 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # This assumes the script is executed from the root of the repo directory ./build.sh -v libcudf --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" else - ./build.sh -v libcudf tests --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" + ./build.sh -v libcudf tests --allgpuarch --build_metrics --incl_cache_stats --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" fi diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index bd9b76e4890..2cbe5173de0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -158,6 +158,7 @@ test: - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp + - test -f $PREFIX/include/cudf/lists/filling.hpp - test -f $PREFIX/include/cudf/lists/contains.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp diff --git a/conda/recipes/libcudf_kafka/build.sh b/conda/recipes/libcudf_kafka/build.sh index cbe4584cb63..b656f55a64e 100644 --- a/conda/recipes/libcudf_kafka/build.sh +++ b/conda/recipes/libcudf_kafka/build.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # This assumes the script is executed from the root of the repo directory diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 6b15890e7c7..0b274f3a41d 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -26,7 +26,7 @@ requirements: - cmake >=3.20.1 host: - libcudf {{version}} - - librdkafka >=1.6.0,<1.7.0a0 + - librdkafka >=1.7.0,<1.8.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 84e486c7e18..a8100fb3f92 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -618,7 +618,7 @@ set_target_properties( ) target_compile_options( - cudftestutil PUBLIC "$<$:${CUDF_CXX_FLAGS}>" + cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" "$:${CUDF_CUDA_FLAGS}>>" ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 63f6857ee08..370f84fc14a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,6 +177,7 @@ ConfigureBench( groupby/group_shift_benchmark.cu groupby/group_struct_benchmark.cu groupby/group_no_requests_benchmark.cu + groupby/group_scan_benchmark.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/groupby/group_benchmark_common.hpp b/cpp/benchmarks/groupby/group_benchmark_common.hpp new file mode 100644 index 00000000000..fba5bc28822 --- /dev/null +++ b/cpp/benchmarks/groupby/group_benchmark_common.hpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +template +T random_int(T min, T max) +{ + static unsigned seed = 13377331; + static std::mt19937 engine{seed}; + static std::uniform_int_distribution uniform{min, max}; + + return uniform(engine); +} diff --git a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu index 7dbe1888cee..209155862bd 100644 --- a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu +++ b/cpp/benchmarks/groupby/group_no_requests_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,32 +14,23 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include #include #include + #include -#include -#include #include -#include class Groupby : public cudf::benchmark { }; -// TODO: put it in a struct so `uniform` can be remade with different min, max -template -T random_int(T min, T max) -{ - static unsigned seed = 13377331; - static std::mt19937 engine{seed}; - static std::uniform_int_distribution uniform{min, max}; - - return uniform(engine); -} - void BM_basic_no_requests(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/benchmarks/groupby/group_nth_benchmark.cu b/cpp/benchmarks/groupby/group_nth_benchmark.cu index 8d1de36db95..107b3839c4c 100644 --- a/cpp/benchmarks/groupby/group_nth_benchmark.cu +++ b/cpp/benchmarks/groupby/group_nth_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,32 +14,23 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include #include #include + #include -#include -#include #include -#include class Groupby : public cudf::benchmark { }; -// TODO: put it in a struct so `uniform` can be remade with different min, max -template -T random_int(T min, T max) -{ - static unsigned seed = 13377331; - static std::mt19937 engine{seed}; - static std::uniform_int_distribution uniform{min, max}; - - return uniform(engine); -} - void BM_pre_sorted_nth(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/benchmarks/groupby/group_scan_benchmark.cu b/cpp/benchmarks/groupby/group_scan_benchmark.cu new file mode 100644 index 00000000000..d9849e53498 --- /dev/null +++ b/cpp/benchmarks/groupby/group_scan_benchmark.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +class Groupby : public cudf::benchmark { +}; + +void BM_basic_sum_scan(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100); }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size); + + cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys})); + + std::vector requests; + requests.emplace_back(cudf::groupby::scan_request()); + requests[0].values = vals; + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + + auto result = gb_obj.scan(requests); + } +} + +BENCHMARK_DEFINE_F(Groupby, BasicSumScan)(::benchmark::State& state) { BM_basic_sum_scan(state); } + +BENCHMARK_REGISTER_F(Groupby, BasicSumScan) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); + +void BM_pre_sorted_sum_scan(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100); }); + auto valid_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return random_int(0, 100) < 90; }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size, valid_it); + + auto keys_table = cudf::table_view({keys}); + auto sort_order = cudf::sorted_order(keys_table); + auto sorted_keys = cudf::gather(keys_table, *sort_order); + // No need to sort values using sort_order because they were generated randomly + + cudf::groupby::groupby gb_obj(*sorted_keys, cudf::null_policy::EXCLUDE, cudf::sorted::YES); + + std::vector requests; + requests.emplace_back(cudf::groupby::scan_request()); + requests[0].values = vals; + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + + auto result = gb_obj.scan(requests); + } +} + +BENCHMARK_DEFINE_F(Groupby, PreSortedSumScan)(::benchmark::State& state) +{ + BM_pre_sorted_sum_scan(state); +} + +BENCHMARK_REGISTER_F(Groupby, PreSortedSumScan) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); diff --git a/cpp/benchmarks/groupby/group_shift_benchmark.cu b/cpp/benchmarks/groupby/group_shift_benchmark.cu index 81afcdd80e1..6b0710f4044 100644 --- a/cpp/benchmarks/groupby/group_shift_benchmark.cu +++ b/cpp/benchmarks/groupby/group_shift_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,7 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include +#include #include #include @@ -24,24 +26,9 @@ #include -#include - -#include - class Groupby : public cudf::benchmark { }; -// TODO: put it in a struct so `uniform` can be remade with different min, max -template -T random_int(T min, T max) -{ - static unsigned seed = 13377331; - static std::mt19937 engine{seed}; - static std::uniform_int_distribution uniform{min, max}; - - return uniform(engine); -} - void BM_group_shift(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum_benchmark.cu index 0e9f5061a1a..63f9aa02070 100644 --- a/cpp/benchmarks/groupby/group_sum_benchmark.cu +++ b/cpp/benchmarks/groupby/group_sum_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,32 +14,23 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include #include #include + #include -#include -#include #include -#include class Groupby : public cudf::benchmark { }; -// TODO: put it in a struct so `uniform` can be remade with different min, max -template -T random_int(T min, T max) -{ - static unsigned seed = 13377331; - static std::mt19937 engine{seed}; - static std::uniform_int_distribution uniform{min, max}; - - return uniform(engine); -} - void BM_basic_sum(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp index a68ce2bd1a1..888102c03be 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp @@ -89,14 +89,14 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const use_pandas_metadata = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; - auto const data_types = - dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL), - int32_t(type_group_id::FLOATING_POINT), - int32_t(type_group_id::FIXED_POINT), - int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING), - int32_t(cudf::type_id::LIST)}), - col_sel); + auto const data_types = dtypes_for_column_selection( + get_type_or_group({static_cast(type_group_id::INTEGRAL), + static_cast(type_group_id::FLOATING_POINT), + static_cast(type_group_id::FIXED_POINT), + static_cast(type_group_id::TIMESTAMP), + static_cast(cudf::type_id::STRING), + static_cast(cudf::type_id::LIST)}), + col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -181,6 +181,9 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); +// Disabled until we add an API to read metadata from a parquet file and determine num row groups. +// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 +/* BENCHMARK_DEFINE_F(ParquetRead, row_selection) (::benchmark::State& state) { BM_parq_read_varying_options(state); } BENCHMARK_REGISTER_F(ParquetRead, row_selection) @@ -191,6 +194,7 @@ BENCHMARK_REGISTER_F(ParquetRead, row_selection) {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); +*/ BENCHMARK_DEFINE_F(ParquetRead, misc_options) (::benchmark::State& state) { BM_parq_read_varying_options(state); } diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index b721448b45a..76ebb48d195 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -30,7 +30,7 @@ The details are based on features documented at https://www.regular-expressions. | Literal character | Any character except `[\^$.⎮?*+()` | All characters except the listed special characters match a single instance of themselves | `a` matches `a` | | Literal curly braces | `{` and `}` | `{` and `}` are literal characters, unless they are part of a valid regular expression token such as a quantifier `{3}` | `{` matches `{` | | Backslash escapes a metacharacter | `\` followed by any of `[\^$.⎮?*+(){}` | A backslash escapes special characters to suppress their special meaning | `\*` matches `*` | -| Hexadecimal escape | `\xFF` where `FF` are 2 hexadecimal digits | Matches the character at the specified position in the code page | `\xA9` matches `©` | +| Hexadecimal escape | `\xFF` where `FF` are 2 hexadecimal digits | Matches the character at the specified position in the ASCII table | `\x40` matches `@` | | Character escape | `\n`, `\r` and `\t` | Match an line-feed (LF) character, carriage return (CR) character and a tab character respectively | `\r\n` matches a Windows CRLF line break | | Character escape | `\a` | Match the "alert" or "bell" control character (ASCII 0x07) | | | Character escape | `\f` | Match the form-feed control character (ASCII 0x0C) | | diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index f974088c8e7..0b739482c4d 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -57,11 +57,8 @@ struct expression_result { /** * Helper function to get the subclass type to dispatch methods to. */ - CUDA_DEVICE_CALLABLE Subclass& subclass() { return static_cast(*this); } - CUDA_DEVICE_CALLABLE Subclass const& subclass() const - { - return static_cast(*this); - } + __device__ inline Subclass& subclass() { return static_cast(*this); } + __device__ inline Subclass const& subclass() const { return static_cast(*this); } // TODO: The index is ignored by the value subclass, but is included in this // signature because it is required by the implementation in the template @@ -73,15 +70,15 @@ struct expression_result { // used, whereas passing it as a parameter keeps it in registers for fast // access at the point where indexing occurs. template - CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index, - possibly_null_value_t const& result) + __device__ inline void set_value(cudf::size_type index, + possibly_null_value_t const& result) { subclass().template set_value(index, result); } - CUDA_DEVICE_CALLABLE bool is_valid() const { return subclass().is_valid(); } + __device__ inline bool is_valid() const { return subclass().is_valid(); } - CUDA_DEVICE_CALLABLE T value() const { return subclass().value(); } + __device__ inline T value() const { return subclass().value(); } }; /** @@ -97,11 +94,11 @@ struct expression_result { template struct value_expression_result : public expression_result, T, has_nulls> { - CUDA_DEVICE_CALLABLE value_expression_result() {} + __device__ inline value_expression_result() {} template - CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index, - possibly_null_value_t const& result) + __device__ inline void set_value(cudf::size_type index, + possibly_null_value_t const& result) { if constexpr (std::is_same_v) { _obj = result; @@ -113,7 +110,7 @@ struct value_expression_result /** * @brief Returns true if the underlying data is valid and false otherwise. */ - CUDA_DEVICE_CALLABLE bool is_valid() const + __device__ inline bool is_valid() const { if constexpr (has_nulls) { return _obj.has_value(); } return true; @@ -125,7 +122,7 @@ struct value_expression_result * If the underlying data is not valid, behavior is undefined. Callers should * use is_valid to check for validity before accessing the value. */ - CUDA_DEVICE_CALLABLE T value() const + __device__ inline T value() const { // Using two separate constexprs silences compiler warnings, whereas an // if/else does not. An unconditional return is not ignored by the compiler @@ -156,13 +153,11 @@ struct mutable_column_expression_result : public expression_result, mutable_column_device_view, has_nulls> { - CUDA_DEVICE_CALLABLE mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) - { - } + __device__ inline mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) {} template - CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index, - possibly_null_value_t const& result) + __device__ inline void set_value(cudf::size_type index, + possibly_null_value_t const& result) { if constexpr (has_nulls) { if (result.has_value()) { @@ -179,7 +174,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - CUDA_DEVICE_CALLABLE bool is_valid() const + __device__ inline bool is_valid() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. @@ -191,7 +186,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - CUDA_DEVICE_CALLABLE mutable_column_device_view value() const + __device__ inline mutable_column_device_view value() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. @@ -222,7 +217,7 @@ struct single_dispatch_binary_operator { * @param args Forwarded arguments to `operator()` of `f`. */ template - CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) + __device__ inline auto operator()(F&& f, Ts&&... args) { f.template operator()(std::forward(args)...); } @@ -247,9 +242,9 @@ struct expression_evaluator { * storing intermediates. */ - CUDA_DEVICE_CALLABLE expression_evaluator(table_device_view const& left, - table_device_view const& right, - expression_device_view const& plan) + __device__ inline expression_evaluator(table_device_view const& left, + table_device_view const& right, + expression_device_view const& plan) : left(left), right(right), plan(plan) { } @@ -262,8 +257,8 @@ struct expression_evaluator { * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for * storing intermediates. */ - CUDA_DEVICE_CALLABLE expression_evaluator(table_device_view const& table, - expression_device_view const& plan) + __device__ inline expression_evaluator(table_device_view const& table, + expression_device_view const& plan) : expression_evaluator(table, table, plan) { } @@ -282,7 +277,7 @@ struct expression_evaluator { * @return Element The type- and null-resolved data. */ template ())> - CUDA_DEVICE_CALLABLE possibly_null_value_t resolve_input( + __device__ inline possibly_null_value_t resolve_input( detail::device_data_reference const& input_reference, IntermediateDataType* thread_intermediate_storage, cudf::size_type left_row_index, @@ -333,7 +328,7 @@ struct expression_evaluator { template ())> - CUDA_DEVICE_CALLABLE possibly_null_value_t resolve_input( + __device__ inline possibly_null_value_t resolve_input( detail::device_data_reference const& device_data_reference, IntermediateDataType* thread_intermediate_storage, cudf::size_type left_row_index, @@ -358,7 +353,7 @@ struct expression_evaluator { * @param op The operator to act with. */ template - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const input_row_index, detail::device_data_reference const& input, @@ -395,7 +390,7 @@ struct expression_evaluator { * @param op The operator to act with. */ template - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const left_row_index, cudf::size_type const right_row_index, @@ -431,9 +426,10 @@ struct expression_evaluator { * @param row_index Row index of all input and output data column(s). */ template - CUDF_DFI void evaluate(expression_result& output_object, - cudf::size_type const row_index, - IntermediateDataType* thread_intermediate_storage) + __device__ __forceinline__ void evaluate( + expression_result& output_object, + cudf::size_type const row_index, + IntermediateDataType* thread_intermediate_storage) { evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage); } @@ -451,11 +447,12 @@ struct expression_evaluator { * @param output_row_index The row in the output to insert the result. */ template - CUDF_DFI void evaluate(expression_result& output_object, - cudf::size_type const left_row_index, - cudf::size_type const right_row_index, - cudf::size_type const output_row_index, - IntermediateDataType* thread_intermediate_storage) + __device__ __forceinline__ void evaluate( + expression_result& output_object, + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + cudf::size_type const output_row_index, + IntermediateDataType* thread_intermediate_storage) { cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); @@ -517,7 +514,7 @@ struct expression_evaluator { */ struct expression_output_handler { public: - CUDA_DEVICE_CALLABLE expression_output_handler() {} + __device__ inline expression_output_handler() {} /** * @brief Resolves an output data reference and assigns result value. @@ -539,7 +536,7 @@ struct expression_evaluator { typename T, bool result_has_nulls, CUDF_ENABLE_IF(is_rep_layout_compatible())> - CUDA_DEVICE_CALLABLE void resolve_output( + __device__ inline void resolve_output( expression_result& output_object, detail::device_data_reference const& device_data_reference, cudf::size_type const row_index, @@ -563,7 +560,7 @@ struct expression_evaluator { typename T, bool result_has_nulls, CUDF_ENABLE_IF(!is_rep_layout_compatible())> - CUDA_DEVICE_CALLABLE void resolve_output( + __device__ inline void resolve_output( expression_result& output_object, detail::device_data_reference const& device_data_reference, cudf::size_type const row_index, @@ -582,7 +579,7 @@ struct expression_evaluator { */ template struct unary_expression_output_handler : public expression_output_handler { - CUDA_DEVICE_CALLABLE unary_expression_output_handler() {} + __device__ inline unary_expression_output_handler() {} /** * @brief Callable to perform a unary operation. @@ -602,7 +599,7 @@ struct expression_evaluator { std::enable_if_t< detail::is_valid_unary_op, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& input, @@ -626,7 +623,7 @@ struct expression_evaluator { std::enable_if_t< !detail::is_valid_unary_op, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& input, @@ -645,7 +642,7 @@ struct expression_evaluator { */ template struct binary_expression_output_handler : public expression_output_handler { - CUDA_DEVICE_CALLABLE binary_expression_output_handler() {} + __device__ inline binary_expression_output_handler() {} /** * @brief Callable to perform a binary operation. @@ -667,7 +664,7 @@ struct expression_evaluator { possibly_null_value_t, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& lhs, @@ -693,7 +690,7 @@ struct expression_evaluator { !detail::is_valid_binary_op, possibly_null_value_t, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& lhs, diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index cffefcaf9cd..d7fd109f12a 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -49,9 +49,7 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable::value; * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op, - F&& f, - Ts&&... args) +CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args) { switch (op) { case ast_operator::ADD: @@ -234,7 +232,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs + rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -245,7 +243,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs - rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs - rhs) { return lhs - rhs; } @@ -256,7 +254,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs * rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -267,7 +265,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs / rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs / rhs) { return lhs / rhs; } @@ -278,7 +276,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(static_cast(lhs) / static_cast(rhs)) { return static_cast(lhs) / static_cast(rhs); @@ -290,7 +288,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(floor(static_cast(lhs) / static_cast(rhs))) { return floor(static_cast(lhs) / static_cast(rhs)); @@ -305,7 +303,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(static_cast(lhs) % static_cast(rhs)) { return static_cast(lhs) % static_cast(rhs); @@ -315,7 +313,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmodf(static_cast(lhs), static_cast(rhs))) { return fmodf(static_cast(lhs), static_cast(rhs)); @@ -325,7 +323,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmod(static_cast(lhs), static_cast(rhs))) { return fmod(static_cast(lhs), static_cast(rhs)); @@ -340,7 +338,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(((static_cast(lhs) % static_cast(rhs)) + static_cast(rhs)) % static_cast(rhs)) @@ -354,7 +352,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + static_cast(rhs), static_cast(rhs))) @@ -368,7 +366,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmod(fmod(static_cast(lhs), static_cast(rhs)) + static_cast(rhs), static_cast(rhs))) @@ -384,7 +382,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(std::pow(lhs, rhs)) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(std::pow(lhs, rhs)) { return std::pow(lhs, rhs); } @@ -395,7 +393,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs == rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs == rhs) { return lhs == rhs; } @@ -412,7 +410,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs != rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs != rhs) { return lhs != rhs; } @@ -423,7 +421,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs < rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs < rhs) { return lhs < rhs; } @@ -434,7 +432,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs > rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs > rhs) { return lhs > rhs; } @@ -445,7 +443,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs <= rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs <= rhs) { return lhs <= rhs; } @@ -456,7 +454,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs >= rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs >= rhs) { return lhs >= rhs; } @@ -467,7 +465,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs & rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs & rhs) { return lhs & rhs; } @@ -478,7 +476,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs | rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs | rhs) { return lhs | rhs; } @@ -489,7 +487,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs ^ rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs ^ rhs) { return lhs ^ rhs; } @@ -500,7 +498,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs && rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs && rhs) { return lhs && rhs; } @@ -517,7 +515,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs || rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs || rhs) { return lhs || rhs; } @@ -534,7 +532,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(input) + __device__ inline auto operator()(InputT input) -> decltype(input) { return input; } @@ -545,7 +543,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::sin(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::sin(input)) { return std::sin(input); } @@ -556,7 +554,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::cos(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::cos(input)) { return std::cos(input); } @@ -567,7 +565,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::tan(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::tan(input)) { return std::tan(input); } @@ -578,7 +576,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::asin(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::asin(input)) { return std::asin(input); } @@ -589,7 +587,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::acos(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::acos(input)) { return std::acos(input); } @@ -600,7 +598,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::atan(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::atan(input)) { return std::atan(input); } @@ -611,7 +609,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::sinh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::sinh(input)) { return std::sinh(input); } @@ -622,7 +620,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::cosh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::cosh(input)) { return std::cosh(input); } @@ -633,7 +631,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::tanh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::tanh(input)) { return std::tanh(input); } @@ -644,7 +642,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::asinh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::asinh(input)) { return std::asinh(input); } @@ -655,7 +653,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::acosh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::acosh(input)) { return std::acosh(input); } @@ -666,7 +664,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::atanh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::atanh(input)) { return std::atanh(input); } @@ -677,7 +675,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::exp(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::exp(input)) { return std::exp(input); } @@ -688,7 +686,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::log(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::log(input)) { return std::log(input); } @@ -699,7 +697,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::sqrt(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::sqrt(input)) { return std::sqrt(input); } @@ -710,7 +708,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::cbrt(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::cbrt(input)) { return std::cbrt(input); } @@ -721,7 +719,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::ceil(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::ceil(input)) { return std::ceil(input); } @@ -732,7 +730,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::floor(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::floor(input)) { return std::floor(input); } @@ -744,13 +742,13 @@ struct operator_functor { // Only accept signed or unsigned types (both require is_arithmetic to be true) template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::abs(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::abs(input)) { return std::abs(input); } template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(input) + __device__ inline auto operator()(InputT input) -> decltype(input) { return input; } @@ -761,7 +759,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::rint(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::rint(input)) { return std::rint(input); } @@ -772,7 +770,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(~input) + __device__ inline auto operator()(InputT input) -> decltype(~input) { return ~input; } @@ -783,7 +781,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(!input) + __device__ inline auto operator()(InputT input) -> decltype(!input) { return !input; } @@ -793,7 +791,7 @@ template struct cast { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(From f) -> decltype(static_cast(f)) + __device__ inline auto operator()(From f) -> decltype(static_cast(f)) { return static_cast(f); } @@ -822,7 +820,7 @@ struct operator_functor { typename RHS, std::size_t arity_placeholder = arity, std::enable_if_t* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { using Out = possibly_null_value_t; @@ -832,7 +830,7 @@ struct operator_functor { template * = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(Input const input) + __device__ inline auto operator()(Input const input) -> possibly_null_value_t { using Out = possibly_null_value_t; @@ -848,7 +846,7 @@ struct operator_functor { static constexpr auto arity = NonNullOperator::arity; template - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { // Case 1: Neither is null, so the output is given by the operation. @@ -869,7 +867,7 @@ struct operator_functor { static constexpr auto arity = NonNullOperator::arity; template - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { // Case 1: Neither is null, so the output is given by the operation. @@ -892,7 +890,7 @@ struct operator_functor { static constexpr auto arity = NonNullOperator::arity; template - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { // Case 1: Neither is null, so the output is given by the operation. @@ -922,7 +920,7 @@ struct single_dispatch_binary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { f.template operator()(std::forward(args)...); } @@ -931,7 +929,7 @@ struct single_dispatch_binary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation."); @@ -960,10 +958,10 @@ struct type_dispatch_binary_op { * @param args Forwarded arguments to `operator()` of `f`. */ template - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type lhs_type, - cudf::data_type rhs_type, - F&& f, - Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type, + cudf::data_type rhs_type, + F&& f, + Ts&&... args) { // Single dispatch (assume lhs_type == rhs_type) type_dispatcher( @@ -986,7 +984,7 @@ struct type_dispatch_binary_op { * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDA_HOST_DEVICE_CALLABLE constexpr void binary_operator_dispatcher( +CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher( ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) { ast_operator_dispatcher(op, @@ -1011,7 +1009,7 @@ struct dispatch_unary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { f.template operator()(std::forward(args)...); } @@ -1020,7 +1018,7 @@ struct dispatch_unary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid unary operation."); @@ -1035,7 +1033,7 @@ struct dispatch_unary_operator_types { */ struct type_dispatch_unary_op { template - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type input_type, F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) { type_dispatcher( input_type, @@ -1056,10 +1054,10 @@ struct type_dispatch_unary_op { * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDA_HOST_DEVICE_CALLABLE constexpr void unary_operator_dispatcher(ast_operator op, - cudf::data_type input_type, - F&& f, - Ts&&... args) +CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op, + cudf::data_type input_type, + F&& f, + Ts&&... args) { ast_operator_dispatcher(op, detail::type_dispatch_unary_op{}, @@ -1084,7 +1082,7 @@ struct return_type_functor { typename LHS, typename RHS, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { using Out = cuda::std::invoke_result_t; result = cudf::data_type(cudf::type_to_id()); @@ -1094,7 +1092,7 @@ struct return_type_functor { typename LHS, typename RHS, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); @@ -1113,7 +1111,7 @@ struct return_type_functor { template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { using Out = cuda::std::invoke_result_t; result = cudf::data_type(cudf::type_to_id()); @@ -1122,7 +1120,7 @@ struct return_type_functor { template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); @@ -1161,7 +1159,7 @@ inline cudf::data_type ast_operator_return_type(ast_operator op, */ struct arity_functor { template - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::size_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result) { // Arity is not dependent on null handling, so just use the false implementation here. result = operator_functor::arity; @@ -1174,7 +1172,7 @@ struct arity_functor { * @param op Operator used to determine arity. * @return Arity of the operator. */ -CUDA_HOST_DEVICE_CALLABLE cudf::size_type ast_operator_arity(ast_operator op) +CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) { auto result = cudf::size_type(0); ast_operator_dispatcher(op, detail::arity_functor{}, result); diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index a15f20ef52d..b29df1852b2 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1151,8 +1151,7 @@ struct optional_accessor { if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } - CUDA_DEVICE_CALLABLE - thrust::optional operator()(cudf::size_type i) const + __device__ inline thrust::optional operator()(cudf::size_type i) const { if (has_nulls) { return (col.is_valid_nocheck(i)) ? thrust::optional{col.element(i)} @@ -1196,8 +1195,7 @@ struct pair_accessor { if (has_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {col.element(i), (has_nulls ? col.is_valid_nocheck(i) : true)}; } @@ -1237,21 +1235,20 @@ struct pair_rep_accessor { if (has_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {get_rep(i), (has_nulls ? col.is_valid_nocheck(i) : true)}; } private: template , void>* = nullptr> - CUDA_DEVICE_CALLABLE auto get_rep(cudf::size_type i) const + __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - CUDA_DEVICE_CALLABLE auto get_rep(cudf::size_type i) const + __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index c2bd7a4893c..3674efbcc7b 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1324,9 +1324,9 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); */ #pragma nv_exec_check_disable template -CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kind k, - F&& f, - Ts&&... args) +CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind k, + F&& f, + Ts&&... args) { switch (k) { case aggregation::SUM: @@ -1418,7 +1418,7 @@ template struct dispatch_aggregation { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const + CUDF_HOST_DEVICE inline decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -1427,9 +1427,7 @@ struct dispatch_aggregation { struct dispatch_source { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(aggregation::Kind k, - F&& f, - Ts&&... args) const + CUDF_HOST_DEVICE inline decltype(auto) operator()(aggregation::Kind k, F&& f, Ts&&... args) const { return aggregation_dispatcher( k, dispatch_aggregation{}, std::forward(f), std::forward(args)...); @@ -1453,8 +1451,10 @@ struct dispatch_source { */ #pragma nv_exec_check_disable template -CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) dispatch_type_and_aggregation( - data_type type, aggregation::Kind k, F&& f, Ts&&... args) +CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(data_type type, + aggregation::Kind k, + F&& f, + Ts&&... args) { return type_dispatcher(type, dispatch_source{}, k, std::forward(f), std::forward(args)...); } diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index d0fa4e02440..11c82da8097 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -55,7 +55,7 @@ struct base_indexalator { /** * @brief Prefix increment operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator++() + CUDF_HOST_DEVICE inline T& operator++() { T& derived = static_cast(*this); derived.p_ += width_; @@ -65,7 +65,7 @@ struct base_indexalator { /** * @brief Postfix increment operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator++(int) + CUDF_HOST_DEVICE inline T operator++(int) { T tmp{static_cast(*this)}; operator++(); @@ -75,7 +75,7 @@ struct base_indexalator { /** * @brief Prefix decrement operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator--() + CUDF_HOST_DEVICE inline T& operator--() { T& derived = static_cast(*this); derived.p_ -= width_; @@ -85,7 +85,7 @@ struct base_indexalator { /** * @brief Postfix decrement operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator--(int) + CUDF_HOST_DEVICE inline T operator--(int) { T tmp{static_cast(*this)}; operator--(); @@ -95,7 +95,7 @@ struct base_indexalator { /** * @brief Compound assignment by sum operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator+=(difference_type offset) + CUDF_HOST_DEVICE inline T& operator+=(difference_type offset) { T& derived = static_cast(*this); derived.p_ += offset * width_; @@ -105,7 +105,7 @@ struct base_indexalator { /** * @brief Increment by offset operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator+(difference_type offset) const + CUDF_HOST_DEVICE inline T operator+(difference_type offset) const { auto tmp = T{static_cast(*this)}; tmp.p_ += (offset * width_); @@ -115,7 +115,7 @@ struct base_indexalator { /** * @brief Addition assignment operator. */ - CUDA_HOST_DEVICE_CALLABLE friend T operator+(difference_type offset, T const& rhs) + CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs) { T tmp{rhs}; tmp.p_ += (offset * rhs.width_); @@ -125,7 +125,7 @@ struct base_indexalator { /** * @brief Compound assignment by difference operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator-=(difference_type offset) + CUDF_HOST_DEVICE inline T& operator-=(difference_type offset) { T& derived = static_cast(*this); derived.p_ -= offset * width_; @@ -135,7 +135,7 @@ struct base_indexalator { /** * @brief Decrement by offset operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator-(difference_type offset) const + CUDF_HOST_DEVICE inline T operator-(difference_type offset) const { auto tmp = T{static_cast(*this)}; tmp.p_ -= (offset * width_); @@ -145,7 +145,7 @@ struct base_indexalator { /** * @brief Subtraction assignment operator. */ - CUDA_HOST_DEVICE_CALLABLE friend T operator-(difference_type offset, T const& rhs) + CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs) { T tmp{rhs}; tmp.p_ -= (offset * rhs.width_); @@ -155,7 +155,7 @@ struct base_indexalator { /** * @brief Compute offset from iterator difference operator. */ - CUDA_HOST_DEVICE_CALLABLE difference_type operator-(T const& rhs) const + CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const { return (static_cast(*this).p_ - rhs.p_) / width_; } @@ -163,42 +163,42 @@ struct base_indexalator { /** * @brief Equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator==(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const { return rhs.p_ == static_cast(*this).p_; } /** * @brief Not equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator!=(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const { return rhs.p_ != static_cast(*this).p_; } /** * @brief Less than operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator<(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const { return static_cast(*this).p_ < rhs.p_; } /** * @brief Greater than operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator>(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const { return static_cast(*this).p_ > rhs.p_; } /** * @brief Less than or equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator<=(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const { return static_cast(*this).p_ <= rhs.p_; } /** * @brief Greater than or equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator>=(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const { return static_cast(*this).p_ >= rhs.p_; } @@ -253,7 +253,7 @@ struct input_indexalator : base_indexalator { /** * @brief Indirection operator returns the value at the current iterator position. */ - CUDA_DEVICE_CALLABLE size_type operator*() const { return operator[](0); } + __device__ inline size_type operator*() const { return operator[](0); } /** * @brief Dispatch functor for resolving a size_type value from any index type. @@ -275,7 +275,7 @@ struct input_indexalator : base_indexalator { * @brief Array subscript operator returns a value at the input * `idx` position as a `size_type` value. */ - CUDA_DEVICE_CALLABLE size_type operator[](size_type idx) const + __device__ inline size_type operator[](size_type idx) const { void const* tp = p_ + (idx * width_); return type_dispatcher(dtype_, index_as_size_type{}, tp); @@ -339,14 +339,14 @@ struct output_indexalator : base_indexalator { * @brief Indirection operator returns this iterator instance in order * to capture the `operator=(size_type)` calls. */ - CUDA_DEVICE_CALLABLE output_indexalator const& operator*() const { return *this; } + __device__ inline output_indexalator const& operator*() const { return *this; } /** * @brief Array subscript operator returns an iterator instance at the specified `idx` position. * * This allows capturing the subsequent `operator=(size_type)` call in this class. */ - CUDA_DEVICE_CALLABLE output_indexalator const operator[](size_type idx) const + __device__ inline output_indexalator const operator[](size_type idx) const { output_indexalator tmp{*this}; tmp.p_ += (idx * width_); @@ -372,7 +372,7 @@ struct output_indexalator : base_indexalator { /** * @brief Assign a size_type value to the current iterator position. */ - CUDA_DEVICE_CALLABLE output_indexalator const& operator=(size_type const value) const + __device__ inline output_indexalator const& operator=(size_type const value) const { void* tp = p_; type_dispatcher(dtype_, size_type_to_index{}, tp, value); diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 01742384972..10d9cda55dd 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -107,8 +107,7 @@ struct null_replaced_value_accessor { if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); } - CUDA_DEVICE_CALLABLE - Element operator()(cudf::size_type i) const + __device__ inline Element operator()(cudf::size_type i) const { return has_nulls && col.is_null_nocheck(i) ? null_replacement : col.element(i); } @@ -135,8 +134,7 @@ struct validity_accessor { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } - CUDA_DEVICE_CALLABLE - bool operator()(cudf::size_type i) const { return col.is_valid_nocheck(i); } + __device__ inline bool operator()(cudf::size_type i) const { return col.is_valid_nocheck(i); } }; /** @@ -344,8 +342,7 @@ struct scalar_value_accessor { * * @return value of the scalar. */ - CUDA_DEVICE_CALLABLE - const Element operator()(size_type) const + __device__ inline const Element operator()(size_type) const { #if defined(__CUDA_ARCH__) return dscalar.value(); @@ -423,8 +420,7 @@ struct scalar_optional_accessor : public scalar_value_accessor { * * @return a thrust::optional for the scalar value. */ - CUDA_HOST_DEVICE_CALLABLE - const value_type operator()(size_type) const + CUDF_HOST_DEVICE inline const value_type operator()(size_type) const { if (has_nulls) { return (super_t::dscalar.is_valid()) ? Element{super_t::dscalar.value()} @@ -457,8 +453,7 @@ struct scalar_pair_accessor : public scalar_value_accessor { * * @return a pair with value and validity of the scalar. */ - CUDA_HOST_DEVICE_CALLABLE - const value_type operator()(size_type) const + CUDF_HOST_DEVICE inline const value_type operator()(size_type) const { #if defined(__CUDA_ARCH__) return {Element(super_t::dscalar.value()), super_t::dscalar.is_valid()}; @@ -509,8 +504,7 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor::value, void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(DeviceScalar const& dscalar) const + __device__ inline rep_type get_rep(DeviceScalar const& dscalar) const { return dscalar.value(); } template ::value, void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(DeviceScalar const& dscalar) const + __device__ inline rep_type get_rep(DeviceScalar const& dscalar) const { return dscalar.rep(); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 6090477c28d..df06ad9e4f3 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -261,7 +261,7 @@ __global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bi */ struct bit_to_word_index { bit_to_word_index(bool inclusive) : inclusive(inclusive) {} - CUDA_DEVICE_CALLABLE size_type operator()(const size_type& bit_index) const + __device__ inline size_type operator()(const size_type& bit_index) const { return word_index(bit_index) + ((inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); } @@ -269,7 +269,7 @@ struct bit_to_word_index { }; struct popc { - CUDA_DEVICE_CALLABLE size_type operator()(bitmask_type word) const { return __popc(word); } + __device__ inline size_type operator()(bitmask_type word) const { return __popc(word); } }; // Count set/unset bits in a segmented null mask, using offset iterators accessible by the device. @@ -377,7 +377,7 @@ size_type validate_segmented_indices(IndexIterator indices_begin, IndexIterator } struct index_alternator { - CUDA_DEVICE_CALLABLE size_type operator()(const size_type& i) const + __device__ inline size_type operator()(const size_type& i) const { return *(d_indices + 2 * i + (is_end ? 1 : 0)); } diff --git a/cpp/include/cudf/detail/reduction_operators.cuh b/cpp/include/cudf/detail/reduction_operators.cuh index 866e26cd655..5a0cb4c1714 100644 --- a/cpp/include/cudf/detail/reduction_operators.cuh +++ b/cpp/include/cudf/detail/reduction_operators.cuh @@ -19,7 +19,7 @@ #include #include #include -#include //for CUDA_HOST_DEVICE_CALLABLE +#include //for CUDF_HOST_DEVICE #include #include @@ -32,14 +32,12 @@ struct var_std { ResultType value; /// the value ResultType value_squared; /// the value of squared - CUDA_HOST_DEVICE_CALLABLE - var_std(ResultType _value = 0, ResultType _value_squared = 0) + CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) : value(_value), value_squared(_value_squared){}; using this_t = var_std; - CUDA_HOST_DEVICE_CALLABLE - this_t operator+(this_t const& rhs) const + CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); }; @@ -50,8 +48,10 @@ template struct transformer_var_std { using OutputType = var_std; - CUDA_HOST_DEVICE_CALLABLE - OutputType operator()(ResultType const& value) { return OutputType(value, value * value); }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) + { + return OutputType(value, value * value); + }; }; // ------------------------------------------------------------------------ @@ -201,9 +201,9 @@ struct compound_op : public simple_op { * @return transformed output result of compound operator */ template - CUDA_HOST_DEVICE_CALLABLE static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { // Enforced interface return Derived::template intermediate::compute_result(input, count, ddof); @@ -230,10 +230,9 @@ struct mean : public compound_op { using IntermediateType = ResultType; // sum value // compute `mean` from intermediate type `IntermediateType` - CUDA_HOST_DEVICE_CALLABLE - static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { return (input / count); }; @@ -252,10 +251,9 @@ struct variance : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `variance` from intermediate type `IntermediateType` - CUDA_HOST_DEVICE_CALLABLE - static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { ResultType mean = input.value / count; ResultType asum = input.value_squared; @@ -279,10 +277,9 @@ struct standard_deviation : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `standard deviation` from intermediate type `IntermediateType` - CUDA_HOST_DEVICE_CALLABLE - static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { using intermediateOp = variance::template intermediate; ResultType var = intermediateOp::compute_result(input, count, ddof); diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 95605dc8a71..a59ad4c42ee 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -40,7 +40,7 @@ namespace detail { template ()>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE auto min(LHS const& lhs, RHS const& rhs) +CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { return std::min(lhs, rhs); } @@ -51,7 +51,7 @@ CUDA_HOST_DEVICE_CALLABLE auto min(LHS const& lhs, RHS const& rhs) template ()>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE auto max(LHS const& lhs, RHS const& rhs) +CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { return std::max(lhs, rhs); } @@ -62,7 +62,7 @@ CUDA_HOST_DEVICE_CALLABLE auto max(LHS const& lhs, RHS const& rhs) */ struct DeviceSum { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -94,13 +94,13 @@ struct DeviceSum { */ struct DeviceCount { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(const T& lhs, const T& rhs) { return T{DeviceCount{}(lhs.time_since_epoch(), rhs.time_since_epoch())}; } template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T&, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(const T&, const T& rhs) { return rhs + T{1}; } @@ -117,7 +117,7 @@ struct DeviceCount { */ struct DeviceMin { template - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::min(lhs, rhs)) { return numeric::detail::min(lhs, rhs); @@ -142,7 +142,7 @@ struct DeviceMin { // @brief identity specialized for string_view template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE static constexpr T identity() + CUDF_HOST_DEVICE inline static constexpr T identity() { return string_view::max(); } @@ -159,7 +159,7 @@ struct DeviceMin { */ struct DeviceMax { template - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::max(lhs, rhs)) { return numeric::detail::max(lhs, rhs); @@ -183,7 +183,7 @@ struct DeviceMax { } template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE static constexpr T identity() + CUDF_HOST_DEVICE inline static constexpr T identity() { return string_view::min(); } @@ -200,7 +200,7 @@ struct DeviceMax { */ struct DeviceProduct { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -224,7 +224,7 @@ struct DeviceProduct { */ struct DeviceAnd { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs) { return (lhs & rhs); } @@ -235,7 +235,7 @@ struct DeviceAnd { */ struct DeviceOr { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) { return (lhs | rhs); } @@ -246,7 +246,7 @@ struct DeviceOr { */ struct DeviceXor { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) { return (lhs ^ rhs); } @@ -258,7 +258,7 @@ struct DeviceXor { struct DeviceLeadLag { const size_type row_offset; - explicit CUDA_HOST_DEVICE_CALLABLE DeviceLeadLag(size_type offset_) : row_offset(offset_) {} + explicit CUDF_HOST_DEVICE inline DeviceLeadLag(size_type offset_) : row_offset(offset_) {} }; } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index f3390d9387b..c35d24ddeac 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -31,7 +31,7 @@ namespace detail { * Normalization of floating point NaNs and zeros, passthrough for all other values. */ template -T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros(T const& key) +T __device__ inline normalize_nans_and_zeros(T const& key) { if constexpr (is_floating_point()) { if (isnan(key)) { @@ -50,7 +50,7 @@ T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros(T const& key) * Licensed under the MIT license. * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT */ -void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destination) +void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) { // Transform 0xABCD1234 => 0x0000ABCD00001234 => 0x0B0A0D0C02010403 uint64_t x = num; @@ -86,12 +86,12 @@ struct MurmurHash3_32 { MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const + __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const { return (x << r) | (x >> (32 - r)); } - CUDA_DEVICE_CALLABLE uint32_t fmix32(uint32_t h) const + __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -118,7 +118,7 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - CUDA_DEVICE_CALLABLE result_type hash_combine(result_type lhs, result_type rhs) + __device__ inline result_type hash_combine(result_type lhs, result_type rhs) { result_type combined{lhs}; @@ -127,11 +127,11 @@ struct MurmurHash3_32 { return combined; } - result_type CUDA_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); } + result_type __device__ inline operator()(Key const& key) const { return compute(key); } // compute wrapper for floating point types template ::value>* = nullptr> - hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const + hash_value_type __device__ inline compute_floating_point(T const& key) const { if (key == T{0.0}) { return compute(T{0.0}); @@ -144,7 +144,7 @@ struct MurmurHash3_32 { } template - result_type CUDA_DEVICE_CALLABLE compute(TKey const& key) const + result_type __device__ inline compute(TKey const& key) const { constexpr int len = sizeof(argument_type); uint8_t const* const data = reinterpret_cast(&key); @@ -191,7 +191,7 @@ struct MurmurHash3_32 { }; template <> -hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(bool const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& key) const { return this->compute(static_cast(key)); } @@ -200,8 +200,8 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(bool const * @brief Specialization of MurmurHash3_32 operator for strings. */ template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(cudf::string_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::string_view const& key) const { auto const len = key.size_bytes(); uint8_t const* data = reinterpret_cast(key.data()); @@ -249,49 +249,49 @@ MurmurHash3_32::operator()(cudf::string_view const& key) cons } template <> -hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(float const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(double const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(numeric::decimal32 const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + numeric::decimal32 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(numeric::decimal64 const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + numeric::decimal64 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(numeric::decimal128 const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + numeric::decimal128 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(cudf::list_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::list_view const& key) const { cudf_assert(false && "List column hashing is not supported"); return 0; } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(cudf::struct_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::struct_view const& key) const { cudf_assert(false && "Direct hashing of struct_view is not supported"); return 0; @@ -305,12 +305,12 @@ struct SparkMurmurHash3_32 { SparkMurmurHash3_32() = default; constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} - CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const + __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const { return (x << r) | (x >> (32 - r)); } - CUDA_DEVICE_CALLABLE uint32_t fmix32(uint32_t h) const + __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -320,11 +320,11 @@ struct SparkMurmurHash3_32 { return h; } - result_type CUDA_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); } + result_type __device__ inline operator()(Key const& key) const { return compute(key); } // compute wrapper for floating point types template ::value>* = nullptr> - hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const + hash_value_type __device__ inline compute_floating_point(T const& key) const { if (isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); @@ -335,7 +335,7 @@ struct SparkMurmurHash3_32 { } template - result_type CUDA_DEVICE_CALLABLE compute(TKey const& key) const + result_type __device__ inline compute(TKey const& key) const { constexpr int len = sizeof(TKey); int8_t const* const data = reinterpret_cast(&key); @@ -379,71 +379,68 @@ struct SparkMurmurHash3_32 { }; template <> -hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(bool const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(bool const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(int8_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int8_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(uint8_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(uint8_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(int16_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int16_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(uint16_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + uint16_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(numeric::decimal32 const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal32 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(numeric::decimal64 const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal64 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(numeric::decimal128 const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal128 const& key) const { return this->compute<__int128_t>(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(cudf::list_view const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::list_view const& key) const { cudf_assert(false && "List column hashing is not supported"); return 0; } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(cudf::struct_view const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::struct_view const& key) const { cudf_assert(false && "Direct hashing of struct_view is not supported"); return 0; @@ -453,8 +450,8 @@ SparkMurmurHash3_32::operator()(cudf::struct_view const& key) * @brief Specialization of MurmurHash3_32 operator for strings. */ template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(cudf::string_view const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::string_view const& key) const { auto const len = key.size_bytes(); int8_t const* data = reinterpret_cast(key.data()); @@ -499,14 +496,13 @@ SparkMurmurHash3_32::operator()(cudf::string_view const& key) } template <> -hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(float const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(float const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(double const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(double const& key) const { return this->compute_floating_point(key); } diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index ddedab3944c..fe501279fd5 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,12 +33,18 @@ namespace cudf { //! Utility functions namespace util { /** - * Finds the smallest integer not less than `number_to_round` and modulo `S` is - * zero. This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. + * @brief Rounds `number_to_round` up to the next multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return smallest integer greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive. The safety is in regard to rollover. */ template -inline S round_up_safe(S number_to_round, S modulus) +S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -50,18 +56,44 @@ inline S round_up_safe(S number_to_round, S modulus) } /** - * Finds the largest integer not greater than `number_to_round` and modulo `S` is - * zero. This function assumes that `number_to_round` is non-negative and - * `modulus` is positive. + * @brief Rounds `number_to_round` down to the last multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return largest integer not greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive and does not check for overflow. */ template -inline S round_down_safe(S number_to_round, S modulus) +S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } +/** + * @brief Rounds `number_to_round` up to the next multiple of modulus + * + * @tparam S type to return + * @param number_to_round number that is being rounded + * @param modulus value to which to round + * @return smallest integer greater than `number_to_round` and modulo `S` is zero. + * + * @note This function assumes that `number_to_round` is non-negative and + * `modulus` is positive and does not check for overflow. + */ +template +constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +{ + auto remainder = number_to_round % modulus; + if (remainder == 0) { return number_to_round; } + auto rounded_up = number_to_round - remainder + modulus; + return rounded_up; +} + /** * Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. @@ -75,16 +107,16 @@ inline S round_down_safe(S number_to_round, S modulus) * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +constexpr S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +constexpr I div_rounding_up_safe(std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -92,9 +124,7 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, } template -constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -116,14 +146,14 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline I div_rounding_up_safe(I dividend, I divisor) noexcept +constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline bool is_a_power_of_two(I val) noexcept +constexpr bool is_a_power_of_two(I val) noexcept { static_assert(std::is_integral::value, "This function only applies to integral types"); return ((val - 1) & val) == 0; @@ -153,7 +183,7 @@ constexpr inline bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr inline auto absolute_value(T value) -> T +constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh index 05a788abd45..12774f57c6a 100644 --- a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh +++ b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh @@ -43,14 +43,13 @@ struct null_replacing_transformer { using type = ResultType; Functor f; type replacement; - CUDA_HOST_DEVICE_CALLABLE - null_replacing_transformer(type null_replacement, Functor transformer) + CUDF_HOST_DEVICE inline null_replacing_transformer(type null_replacement, Functor transformer) : f(transformer), replacement(null_replacement) { } template - CUDA_HOST_DEVICE_CALLABLE type operator()(thrust::pair const& pair_value) + CUDF_HOST_DEVICE inline type operator()(thrust::pair const& pair_value) { if (pair_value.second) return f(pair_value.first); @@ -76,22 +75,21 @@ struct meanvar { ElementType value_squared; /// the value of squared cudf::size_type count; /// the count - CUDA_HOST_DEVICE_CALLABLE - meanvar(ElementType _value = 0, ElementType _value_squared = 0, cudf::size_type _count = 0) + CUDF_HOST_DEVICE inline meanvar(ElementType _value = 0, + ElementType _value_squared = 0, + cudf::size_type _count = 0) : value(_value), value_squared(_value_squared), count(_count){}; using this_t = cudf::meanvar; - CUDA_HOST_DEVICE_CALLABLE - this_t operator+(this_t const& rhs) const + CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared), (this->count + rhs.count)); }; - CUDA_HOST_DEVICE_CALLABLE - bool operator==(this_t const& rhs) const + CUDF_HOST_DEVICE inline bool operator==(this_t const& rhs) const { return ((this->value == rhs.value) && (this->value_squared == rhs.value_squared) && (this->count == rhs.count)); @@ -113,8 +111,10 @@ struct meanvar { */ template struct transformer_squared { - CUDA_HOST_DEVICE_CALLABLE - ElementType operator()(ElementType const& value) { return (value * value); }; + CUDF_HOST_DEVICE inline ElementType operator()(ElementType const& value) + { + return (value * value); + }; }; /** @@ -130,8 +130,7 @@ template struct transformer_meanvar { using ResultType = meanvar; - CUDA_HOST_DEVICE_CALLABLE - ResultType operator()(thrust::pair const& pair) + CUDF_HOST_DEVICE inline ResultType operator()(thrust::pair const& pair) { ElementType v = pair.first; return meanvar(v, v * v, (pair.second) ? 1 : 0); diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index e8223b53997..727dce0db9d 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -79,7 +79,7 @@ template && is_supported_representation_type())>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent) +CUDF_HOST_DEVICE inline Rep ipow(T exponent) { cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); if (exponent == 0) return static_cast(1); @@ -108,7 +108,7 @@ CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent) * @return Shifted value of type T */ template -CUDA_HOST_DEVICE_CALLABLE constexpr T right_shift(T const& val, scale_type const& scale) +CUDF_HOST_DEVICE inline constexpr T right_shift(T const& val, scale_type const& scale) { return val / ipow(static_cast(scale)); } @@ -125,7 +125,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T right_shift(T const& val, scale_type const * @return Shifted value of type T */ template -CUDA_HOST_DEVICE_CALLABLE constexpr T left_shift(T const& val, scale_type const& scale) +CUDF_HOST_DEVICE inline constexpr T left_shift(T const& val, scale_type const& scale) { return val * ipow(static_cast(-scale)); } @@ -144,7 +144,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T left_shift(T const& val, scale_type const& * @return Shifted value of type T */ template -CUDA_HOST_DEVICE_CALLABLE constexpr T shift(T const& val, scale_type const& scale) +CUDF_HOST_DEVICE inline constexpr T shift(T const& val, scale_type const& scale) { if (scale == 0) return val; @@ -179,7 +179,7 @@ template () && is_supported_representation_type()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale) + CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) : _value{static_cast(detail::shift(value, scale))}, _scale{scale} { } @@ -226,7 +226,7 @@ class fixed_point { template () && is_supported_representation_type()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale) + CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) // `value` is cast to `Rep` to avoid overflow in cases where // constructing to `Rep` that is wider than `T` : _value{detail::shift(static_cast(value), scale)}, _scale{scale} @@ -238,8 +238,10 @@ class fixed_point { * * @param s scaled_integer that contains scale and already shifted value */ - CUDA_HOST_DEVICE_CALLABLE - explicit fixed_point(scaled_integer s) : _value{s.value}, _scale{s.scale} {} + CUDF_HOST_DEVICE inline explicit fixed_point(scaled_integer s) + : _value{s.value}, _scale{s.scale} + { + } /** * @brief "Scale-less" constructor that constructs `fixed_point` number with a specified @@ -247,7 +249,7 @@ class fixed_point { */ template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE fixed_point(T const& value) + CUDF_HOST_DEVICE inline fixed_point(T const& value) : _value{static_cast(value)}, _scale{scale_type{0}} { } @@ -256,8 +258,7 @@ class fixed_point { * @brief Default constructor that constructs `fixed_point` number with a * value and scale of zero */ - CUDA_HOST_DEVICE_CALLABLE - fixed_point() : _value{0}, _scale{scale_type{0}} {} + CUDF_HOST_DEVICE inline fixed_point() : _value{0}, _scale{scale_type{0}} {} /** * @brief Explicit conversion operator for casting to floating point types @@ -289,7 +290,7 @@ class fixed_point { return static_cast(detail::shift(value, scale_type{-_scale})); } - CUDA_HOST_DEVICE_CALLABLE operator scaled_integer() const + CUDF_HOST_DEVICE inline operator scaled_integer() const { return scaled_integer{_value, _scale}; } @@ -299,21 +300,21 @@ class fixed_point { * * @return The underlying value of the `fixed_point` number */ - CUDA_HOST_DEVICE_CALLABLE rep value() const { return _value; } + CUDF_HOST_DEVICE inline rep value() const { return _value; } /** * @brief Method that returns the scale of the `fixed_point` number * * @return The scale of the `fixed_point` number */ - CUDA_HOST_DEVICE_CALLABLE scale_type scale() const { return _scale; } + CUDF_HOST_DEVICE inline scale_type scale() const { return _scale; } /** * @brief Explicit conversion operator to `bool` * * @return The `fixed_point` value as a boolean (zero is `false`, nonzero is `true`) */ - CUDA_HOST_DEVICE_CALLABLE explicit constexpr operator bool() const + CUDF_HOST_DEVICE inline explicit constexpr operator bool() const { return static_cast(_value); } @@ -326,7 +327,7 @@ class fixed_point { * @return The sum */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator+=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator+=(fixed_point const& rhs) { *this = *this + rhs; return *this; @@ -340,7 +341,7 @@ class fixed_point { * @return The product */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator*=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator*=(fixed_point const& rhs) { *this = *this * rhs; return *this; @@ -354,7 +355,7 @@ class fixed_point { * @return The difference */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator-=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator-=(fixed_point const& rhs) { *this = *this - rhs; return *this; @@ -368,7 +369,7 @@ class fixed_point { * @return The quotient */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator/=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator/=(fixed_point const& rhs) { *this = *this / rhs; return *this; @@ -379,8 +380,7 @@ class fixed_point { * * @return The incremented result */ - CUDA_HOST_DEVICE_CALLABLE - fixed_point& operator++() + CUDF_HOST_DEVICE inline fixed_point& operator++() { *this = *this + fixed_point{1, scale_type{_scale}}; return *this; @@ -398,7 +398,7 @@ class fixed_point { * @return The resulting `fixed_point` sum */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator+( + CUDF_HOST_DEVICE inline friend fixed_point operator+( fixed_point const& lhs, fixed_point const& rhs); /** @@ -413,7 +413,7 @@ class fixed_point { * @return The resulting `fixed_point` difference */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator-( + CUDF_HOST_DEVICE inline friend fixed_point operator-( fixed_point const& lhs, fixed_point const& rhs); /** @@ -426,7 +426,7 @@ class fixed_point { * @return The resulting `fixed_point` product */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator*( + CUDF_HOST_DEVICE inline friend fixed_point operator*( fixed_point const& lhs, fixed_point const& rhs); /** @@ -439,7 +439,7 @@ class fixed_point { * @return The resulting `fixed_point` quotient */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator/( + CUDF_HOST_DEVICE inline friend fixed_point operator/( fixed_point const& lhs, fixed_point const& rhs); /** @@ -454,8 +454,8 @@ class fixed_point { * @return true if `lhs` and `rhs` are equal, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator==(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator==(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator != (for comparing two `fixed_point` numbers) @@ -469,8 +469,8 @@ class fixed_point { * @return true if `lhs` and `rhs` are not equal, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator!=(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator!=(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator <= (for comparing two `fixed_point` numbers) @@ -484,8 +484,8 @@ class fixed_point { * @return true if `lhs` less than or equal to `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator<=(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator<=(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator >= (for comparing two `fixed_point` numbers) @@ -499,8 +499,8 @@ class fixed_point { * @return true if `lhs` greater than or equal to `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator>=(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator>=(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator < (for comparing two `fixed_point` numbers) @@ -514,8 +514,8 @@ class fixed_point { * @return true if `lhs` less than `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator<(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator<(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator > (for comparing two `fixed_point` numbers) @@ -529,8 +529,8 @@ class fixed_point { * @return true if `lhs` greater than `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator>(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator>(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief Method for creating a `fixed_point` number with a new `scale` @@ -541,7 +541,7 @@ class fixed_point { * @param scale The `scale` of the returned `fixed_point` number * @return `fixed_point` number with a new `scale` */ - CUDA_HOST_DEVICE_CALLABLE fixed_point rescaled(scale_type scale) const + CUDF_HOST_DEVICE inline fixed_point rescaled(scale_type scale) const { if (scale == _scale) return *this; Rep const value = detail::shift(_value, scale_type{scale - _scale}); @@ -580,7 +580,7 @@ class fixed_point { * @return true if addition causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto addition_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto addition_overflow(T lhs, T rhs) { return rhs > 0 ? lhs > cuda::std::numeric_limits::max() - rhs : lhs < cuda::std::numeric_limits::min() - rhs; @@ -595,7 +595,7 @@ CUDA_HOST_DEVICE_CALLABLE auto addition_overflow(T lhs, T rhs) * @return true if subtraction causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto subtraction_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto subtraction_overflow(T lhs, T rhs) { return rhs > 0 ? lhs < cuda::std::numeric_limits::min() + rhs : lhs > cuda::std::numeric_limits::max() + rhs; @@ -610,7 +610,7 @@ CUDA_HOST_DEVICE_CALLABLE auto subtraction_overflow(T lhs, T rhs) * @return true if division causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto division_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto division_overflow(T lhs, T rhs) { return lhs == cuda::std::numeric_limits::min() && rhs == -1; } @@ -624,7 +624,7 @@ CUDA_HOST_DEVICE_CALLABLE auto division_overflow(T lhs, T rhs) * @return true if multiplication causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto multiplication_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto multiplication_overflow(T lhs, T rhs) { auto const min = cuda::std::numeric_limits::min(); auto const max = cuda::std::numeric_limits::max(); @@ -638,8 +638,8 @@ CUDA_HOST_DEVICE_CALLABLE auto multiplication_overflow(T lhs, T rhs) // PLUS Operation template -CUDA_HOST_DEVICE_CALLABLE fixed_point operator+(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator+(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); auto const sum = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value; @@ -656,8 +656,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator+(fixed_point -CUDA_HOST_DEVICE_CALLABLE fixed_point operator-(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator-(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); auto const diff = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value; @@ -674,8 +674,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator-(fixed_point -CUDA_HOST_DEVICE_CALLABLE fixed_point operator*(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator*(fixed_point const& lhs, + fixed_point const& rhs) { #if defined(__CUDACC_DEBUG__) @@ -689,8 +689,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator*(fixed_point -CUDA_HOST_DEVICE_CALLABLE fixed_point operator/(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator/(fixed_point const& lhs, + fixed_point const& rhs) { #if defined(__CUDACC_DEBUG__) @@ -704,8 +704,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator/(fixed_point -CUDA_HOST_DEVICE_CALLABLE bool operator==(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator==(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value; @@ -713,8 +713,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator==(fixed_point const& lhs, // EQUALITY NOT COMPARISON Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator!=(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator!=(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value; @@ -722,8 +722,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator!=(fixed_point const& lhs, // LESS THAN OR EQUAL TO Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator<=(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator<=(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value; @@ -731,8 +731,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator<=(fixed_point const& lhs, // GREATER THAN OR EQUAL TO Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator>=(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator>=(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value; @@ -740,8 +740,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator>=(fixed_point const& lhs, // LESS THAN Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator<(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator<(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value; @@ -749,8 +749,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator<(fixed_point const& lhs, // GREATER THAN Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator>(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator>(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 2b50e273517..be900f252f6 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -62,13 +62,13 @@ constexpr auto abs(T value) } template -CUDA_HOST_DEVICE_CALLABLE auto min(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto min(T lhs, T rhs) { return lhs < rhs ? lhs : rhs; } template -CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto max(T lhs, T rhs) { return lhs > rhs ? lhs : rhs; } diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 512a90b3249..8f06de99f05 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -416,6 +416,11 @@ class table_input_metadata { struct partition_info { size_type start_row; size_type num_rows; + + partition_info() = default; + partition_info(size_type start_row, size_type num_rows) : start_row(start_row), num_rows(num_rows) + { + } }; } // namespace io diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh index 7d0586ed6a6..bdf68037944 100644 --- a/cpp/include/cudf/lists/detail/scatter_helper.cuh +++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh @@ -65,9 +65,9 @@ struct unbound_list_view { * @param lists_column The actual source/target lists column * @param row_index Index of the row in lists_column that this instance represents */ - CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label, - cudf::detail::lists_column_device_view const& lists_column, - size_type const& row_index) + __device__ inline unbound_list_view(label_type scatter_source_label, + cudf::detail::lists_column_device_view const& lists_column, + size_type const& row_index) : _label{scatter_source_label}, _row_index{row_index} { _size = list_device_view{lists_column, row_index}.size(); @@ -81,9 +81,9 @@ struct unbound_list_view { * @param row_index Index of the row that this instance represents in the source/target column * @param size The number of elements in this list row */ - CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label, - size_type const& row_index, - size_type const& size) + __device__ inline unbound_list_view(label_type scatter_source_label, + size_type const& row_index, + size_type const& size) : _label{scatter_source_label}, _row_index{row_index}, _size{size} { } @@ -91,17 +91,17 @@ struct unbound_list_view { /** * @brief Returns number of elements in this list row. */ - CUDA_DEVICE_CALLABLE size_type size() const { return _size; } + __device__ inline size_type size() const { return _size; } /** * @brief Returns whether this row came from the `scatter()` source or target */ - CUDA_DEVICE_CALLABLE label_type label() const { return _label; } + __device__ inline label_type label() const { return _label; } /** * @brief Returns the index in the source/target column */ - CUDA_DEVICE_CALLABLE size_type row_index() const { return _row_index; } + __device__ inline size_type row_index() const { return _row_index; } /** * @brief Binds to source/target column (depending on SOURCE/TARGET labels), @@ -111,9 +111,9 @@ struct unbound_list_view { * @param scatter_target Target column for the scatter operation * @return A (bound) list_view for the row that this object represents */ - CUDA_DEVICE_CALLABLE list_device_view - bind_to_column(lists_column_device_view const& scatter_source, - lists_column_device_view const& scatter_target) const + __device__ inline list_device_view bind_to_column( + lists_column_device_view const& scatter_source, + lists_column_device_view const& scatter_target) const { return list_device_view(_label == label_type::SOURCE ? scatter_source : scatter_target, _row_index); diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 659fb1e6b2a..5071f046e0c 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -32,8 +32,8 @@ class list_device_view { public: list_device_view() = default; - CUDA_DEVICE_CALLABLE list_device_view(lists_column_device_view const& lists_column, - size_type const& row_index) + __device__ inline list_device_view(lists_column_device_view const& lists_column, + size_type const& row_index) : lists_column(lists_column), _row_index(row_index) { column_device_view const& offsets = lists_column.offsets(); @@ -69,7 +69,7 @@ class list_device_view { * The offset of this element as stored in the child column (i.e. 5) * may be fetched using this method. */ - CUDA_DEVICE_CALLABLE size_type element_offset(size_type idx) const + __device__ inline size_type element_offset(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "idx out of bounds"); return begin_offset + idx; @@ -83,7 +83,7 @@ class list_device_view { * @return The element at the specified index of the list row. */ template - CUDA_DEVICE_CALLABLE T element(size_type idx) const + __device__ inline T element(size_type idx) const { return lists_column.child().element(element_offset(idx)); } @@ -91,7 +91,7 @@ class list_device_view { /** * @brief Checks whether element is null at specified index in the list row. */ - CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const + __device__ inline bool is_null(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "Index out of bounds."); auto element_offset = begin_offset + idx; @@ -101,17 +101,17 @@ class list_device_view { /** * @brief Checks whether this list row is null. */ - CUDA_DEVICE_CALLABLE bool is_null() const { return lists_column.is_null(_row_index); } + __device__ inline bool is_null() const { return lists_column.is_null(_row_index); } /** * @brief Fetches the number of elements in this list row. */ - CUDA_DEVICE_CALLABLE size_type size() const { return _size; } + __device__ inline size_type size() const { return _size; } /** * @brief Fetches the lists_column_device_view that contains this list. */ - CUDA_DEVICE_CALLABLE lists_column_device_view const& get_column() const { return lists_column; } + __device__ inline lists_column_device_view const& get_column() const { return lists_column; } template struct pair_accessor; @@ -141,7 +141,7 @@ class list_device_view { * 2. `p.second == false` */ template - CUDA_DEVICE_CALLABLE const_pair_iterator pair_begin() const + __device__ inline const_pair_iterator pair_begin() const { return const_pair_iterator{thrust::counting_iterator(0), pair_accessor{*this}}; } @@ -151,7 +151,7 @@ class list_device_view { * list_device_view. */ template - CUDA_DEVICE_CALLABLE const_pair_iterator pair_end() const + __device__ inline const_pair_iterator pair_end() const { return const_pair_iterator{thrust::counting_iterator(size()), pair_accessor{*this}}; @@ -173,7 +173,7 @@ class list_device_view { * 2. `p.second == false` */ template - CUDA_DEVICE_CALLABLE const_pair_rep_iterator pair_rep_begin() const + __device__ inline const_pair_rep_iterator pair_rep_begin() const { return const_pair_rep_iterator{thrust::counting_iterator(0), pair_rep_accessor{*this}}; @@ -184,7 +184,7 @@ class list_device_view { * list_device_view. */ template - CUDA_DEVICE_CALLABLE const_pair_rep_iterator pair_rep_end() const + __device__ inline const_pair_rep_iterator pair_rep_end() const { return const_pair_rep_iterator{thrust::counting_iterator(size()), pair_rep_accessor{*this}}; @@ -215,7 +215,7 @@ class list_device_view { * * @param _list The `list_device_view` whose rows are being accessed. */ - explicit CUDA_HOST_DEVICE_CALLABLE pair_accessor(list_device_view const& _list) : list{_list} {} + explicit CUDF_HOST_DEVICE inline pair_accessor(list_device_view const& _list) : list{_list} {} /** * @brief Accessor for the {data, validity} pair at the specified index @@ -223,8 +223,7 @@ class list_device_view { * @param i Index into the list_device_view * @return A pair of data element and its validity flag. */ - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {list.element(i), !list.is_null(i)}; } @@ -253,8 +252,7 @@ class list_device_view { * * @param _list The `list_device_view` whose rows are being accessed. */ - explicit CUDA_HOST_DEVICE_CALLABLE pair_rep_accessor(list_device_view const& _list) - : list{_list} + explicit CUDF_HOST_DEVICE inline pair_rep_accessor(list_device_view const& _list) : list{_list} { } @@ -264,21 +262,20 @@ class list_device_view { * @param i Index into the list_device_view * @return A pair of data element and its validity flag. */ - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {get_rep(i), !list.is_null(i)}; } private: template , void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(cudf::size_type i) const + __device__ inline rep_type get_rep(cudf::size_type i) const { return list.element(i); } template , void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(cudf::size_type i) const + __device__ inline rep_type get_rep(cudf::size_type i) const { return list.element(i).value(); } @@ -291,7 +288,7 @@ class list_device_view { */ struct list_size_functor { column_device_view const d_column; - CUDA_HOST_DEVICE_CALLABLE list_size_functor(column_device_view const& d_col) : d_column(d_col) + CUDF_HOST_DEVICE inline list_size_functor(column_device_view const& d_col) : d_column(d_col) { #if defined(__CUDA_ARCH__) cudf_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported"); @@ -299,7 +296,7 @@ struct list_size_functor { CUDF_EXPECTS(d_col.type().id() == type_id::LIST, "Only list type column is supported"); #endif } - CUDA_DEVICE_CALLABLE size_type operator()(size_type idx) + __device__ inline size_type operator()(size_type idx) { if (d_column.is_null(idx)) return size_type{0}; auto d_offsets = diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh index d8f082c9a42..aff088a7f44 100644 --- a/cpp/include/cudf/lists/lists_column_device_view.cuh +++ b/cpp/include/cudf/lists/lists_column_device_view.cuh @@ -46,12 +46,12 @@ class lists_column_device_view { /** * @brief Fetches number of rows in the lists column */ - CUDA_HOST_DEVICE_CALLABLE cudf::size_type size() const { return underlying.size(); } + CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); } /** * @brief Fetches the offsets column of the underlying list column. */ - CUDA_DEVICE_CALLABLE column_device_view offsets() const + __device__ inline column_device_view offsets() const { return underlying.child(lists_column_view::offsets_column_index); } @@ -59,7 +59,7 @@ class lists_column_device_view { /** * @brief Fetches the child column of the underlying list column. */ - CUDA_DEVICE_CALLABLE column_device_view child() const + __device__ inline column_device_view child() const { return underlying.child(lists_column_view::child_column_index); } @@ -67,19 +67,19 @@ class lists_column_device_view { /** * @brief Indicates whether the list column is nullable. */ - CUDA_DEVICE_CALLABLE bool nullable() const { return underlying.nullable(); } + __device__ inline bool nullable() const { return underlying.nullable(); } /** * @brief Indicates whether the row (i.e. list) at the specified * index is null. */ - CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const { return underlying.is_null(idx); } + __device__ inline bool is_null(size_type idx) const { return underlying.is_null(idx); } /** * @brief Fetches the offset of the underlying column_device_view, * in case it is a sliced/offset column. */ - CUDA_DEVICE_CALLABLE size_type offset() const { return underlying.offset(); } + __device__ inline size_type offset() const { return underlying.offset(); } private: column_device_view underlying; diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index 9081fa23eec..8435c47eaf5 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -48,7 +48,7 @@ class get_json_object_options { * @brief Returns true/false depending on whether single-quotes for representing strings * are allowed. */ - CUDA_HOST_DEVICE_CALLABLE bool get_allow_single_quotes() const { return allow_single_quotes; } + CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const { return allow_single_quotes; } /** * @brief Returns true/false depending on whether individually returned string values have @@ -72,7 +72,7 @@ class get_json_object_options { * * @endcode */ - CUDA_HOST_DEVICE_CALLABLE bool get_strip_quotes_from_single_strings() const + CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const { return strip_quotes_from_single_strings; } diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 238d55d580e..43a90997c86 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -80,7 +80,7 @@ static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; * * @return An empty string */ -CUDA_HOST_DEVICE_CALLABLE string_view string_view::min() { return string_view(); } +CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); } /** * @brief Return maximum value associated with the string type @@ -91,7 +91,7 @@ CUDA_HOST_DEVICE_CALLABLE string_view string_view::min() { return string_view(); * @return A string value which represents the highest possible valid UTF-8 encoded * character. */ -CUDA_HOST_DEVICE_CALLABLE string_view string_view::max() +CUDF_HOST_DEVICE inline string_view string_view::max() { const char* psentinel{nullptr}; #if defined(__CUDA_ARCH__) diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index be182cb0e9d..22409ab3dc7 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -51,20 +51,20 @@ class string_view { /** * @brief Return the number of bytes in this string */ - CUDA_HOST_DEVICE_CALLABLE size_type size_bytes() const { return _bytes; } + CUDF_HOST_DEVICE inline size_type size_bytes() const { return _bytes; } /** * @brief Return the number of characters in this string */ - CUDA_DEVICE_CALLABLE size_type length() const; + __device__ inline size_type length() const; /** * @brief Return a pointer to the internal device array */ - CUDA_HOST_DEVICE_CALLABLE const char* data() const { return _data; } + CUDF_HOST_DEVICE inline const char* data() const { return _data; } /** * @brief Return true if string has no characters */ - CUDA_HOST_DEVICE_CALLABLE bool empty() const { return size_bytes() == 0; } + CUDF_HOST_DEVICE inline bool empty() const { return size_bytes() == 0; } /** * @brief Handy iterator for navigating through encoded characters. @@ -76,28 +76,28 @@ class string_view { using reference = char_utf8&; using pointer = char_utf8*; using iterator_category = std::input_iterator_tag; - CUDA_DEVICE_CALLABLE const_iterator(const string_view& str, size_type pos); + __device__ inline const_iterator(const string_view& str, size_type pos); const_iterator(const const_iterator& mit) = default; const_iterator(const_iterator&& mit) = default; const_iterator& operator=(const const_iterator&) = default; const_iterator& operator=(const_iterator&&) = default; - CUDA_DEVICE_CALLABLE const_iterator& operator++(); - CUDA_DEVICE_CALLABLE const_iterator operator++(int); - CUDA_DEVICE_CALLABLE const_iterator& operator+=(difference_type); - CUDA_DEVICE_CALLABLE const_iterator operator+(difference_type); - CUDA_DEVICE_CALLABLE const_iterator& operator--(); - CUDA_DEVICE_CALLABLE const_iterator operator--(int); - CUDA_DEVICE_CALLABLE const_iterator& operator-=(difference_type); - CUDA_DEVICE_CALLABLE const_iterator operator-(difference_type); - CUDA_DEVICE_CALLABLE bool operator==(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator!=(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator<(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator<=(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator>(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator>=(const const_iterator&) const; - CUDA_DEVICE_CALLABLE char_utf8 operator*() const; - CUDA_DEVICE_CALLABLE size_type position() const; - CUDA_DEVICE_CALLABLE size_type byte_offset() const; + __device__ inline const_iterator& operator++(); + __device__ inline const_iterator operator++(int); + __device__ inline const_iterator& operator+=(difference_type); + __device__ inline const_iterator operator+(difference_type); + __device__ inline const_iterator& operator--(); + __device__ inline const_iterator operator--(int); + __device__ inline const_iterator& operator-=(difference_type); + __device__ inline const_iterator operator-(difference_type); + __device__ inline bool operator==(const const_iterator&) const; + __device__ inline bool operator!=(const const_iterator&) const; + __device__ inline bool operator<(const const_iterator&) const; + __device__ inline bool operator<=(const const_iterator&) const; + __device__ inline bool operator>(const const_iterator&) const; + __device__ inline bool operator>=(const const_iterator&) const; + __device__ inline char_utf8 operator*() const; + __device__ inline size_type position() const; + __device__ inline size_type byte_offset() const; private: const char* p{}; @@ -109,24 +109,24 @@ class string_view { /** * @brief Return new iterator pointing to the beginning of this string */ - CUDA_DEVICE_CALLABLE const_iterator begin() const; + __device__ inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string */ - CUDA_DEVICE_CALLABLE const_iterator end() const; + __device__ inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position * * @param pos Character position */ - CUDA_DEVICE_CALLABLE char_utf8 operator[](size_type pos) const; + __device__ inline char_utf8 operator[](size_type pos) const; /** * @brief Return the byte offset from data() for a given character position * * @param pos Character position */ - CUDA_DEVICE_CALLABLE size_type byte_offset(size_type pos) const; + __device__ inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. Each character is compared @@ -141,7 +141,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - CUDA_DEVICE_CALLABLE int compare(const string_view& str) const; + __device__ inline int compare(const string_view& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -156,32 +156,32 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - CUDA_DEVICE_CALLABLE int compare(const char* str, size_type bytes) const; + __device__ inline int compare(const char* str, size_type bytes) const; /** * @brief Returns true if rhs matches this string exactly. */ - CUDA_DEVICE_CALLABLE bool operator==(const string_view& rhs) const; + __device__ inline bool operator==(const string_view& rhs) const; /** * @brief Returns true if rhs does not match this string. */ - CUDA_DEVICE_CALLABLE bool operator!=(const string_view& rhs) const; + __device__ inline bool operator!=(const string_view& rhs) const; /** * @brief Returns true if this string is ordered before rhs. */ - CUDA_DEVICE_CALLABLE bool operator<(const string_view& rhs) const; + __device__ inline bool operator<(const string_view& rhs) const; /** * @brief Returns true if rhs is ordered before this string. */ - CUDA_DEVICE_CALLABLE bool operator>(const string_view& rhs) const; + __device__ inline bool operator>(const string_view& rhs) const; /** * @brief Returns true if this string matches or is ordered before rhs. */ - CUDA_DEVICE_CALLABLE bool operator<=(const string_view& rhs) const; + __device__ inline bool operator<=(const string_view& rhs) const; /** * @brief Returns true if rhs matches or is ordered before this string. */ - CUDA_DEVICE_CALLABLE bool operator>=(const string_view& rhs) const; + __device__ inline bool operator>=(const string_view& rhs) const; /** * @brief Returns the character position of the first occurrence where the @@ -193,9 +193,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if str is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type find(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type find(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the first occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -207,10 +207,10 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type find(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type find(const char* str, + size_type bytes, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the first occurrence where * character is found in this string within the character range [pos,pos+n). @@ -221,9 +221,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type find(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type find(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * argument str is found in this string within the character range [pos,pos+n). @@ -234,9 +234,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type rfind(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type rfind(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -248,10 +248,10 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type rfind(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type rfind(const char* str, + size_type bytes, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where * character is found in this string within the character range [pos,pos+n). @@ -262,9 +262,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type rfind(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type rfind(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Return a sub-string of this string. The original string and device @@ -274,7 +274,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const; + __device__ inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -284,7 +284,7 @@ class string_view { * * @return An empty string */ - CUDA_HOST_DEVICE_CALLABLE static string_view min(); + CUDF_HOST_DEVICE inline static string_view min(); /** * @brief Return maximum value associated with the string type @@ -295,12 +295,12 @@ class string_view { * @return A string value which represents the highest possible valid UTF-8 encoded * character. */ - CUDA_HOST_DEVICE_CALLABLE static string_view max(); + CUDF_HOST_DEVICE inline static string_view max(); /** * @brief Default constructor represents an empty string. */ - CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0) {} + CUDF_HOST_DEVICE inline string_view() : _data(""), _bytes(0), _length(0) {} /** * @brief Create instance from existing device char array. @@ -308,7 +308,7 @@ class string_view { * @param data Device char array encoded in UTF8. * @param bytes Number of bytes in data array. */ - CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes) + CUDF_HOST_DEVICE inline string_view(const char* data, size_type bytes) : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH) { } @@ -330,7 +330,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - CUDA_DEVICE_CALLABLE size_type character_offset(size_type bytepos) const; + __device__ inline size_type character_offset(size_type bytepos) const; }; namespace strings { @@ -386,7 +386,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& character) +CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -413,7 +413,7 @@ CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& cha * @param[out] str Allocated char array with enough space to hold the encoded character. * @return The number of bytes in the character */ -CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 13d5f8e06bc..459a4182aa0 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -17,19 +17,9 @@ #pragma once #ifdef __CUDACC__ -#define CUDA_HOST_DEVICE_CALLABLE __host__ __device__ inline -#define CUDA_DEVICE_CALLABLE __device__ inline - -// This version of the macro maximizes the chances of inlining when applied to -// a callable that is called on the GPU. -#define CUDF_HDFI __host__ __device__ __forceinline__ -#define CUDF_DFI __device__ __forceinline__ +#define CUDF_HOST_DEVICE __host__ __device__ #else -#define CUDA_HOST_DEVICE_CALLABLE inline -#define CUDA_DEVICE_CALLABLE inline - -#define CUDF_HDFI inline -#define CUDF_DFI inline +#define CUDF_HOST_DEVICE #endif #include diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp index cbd09fa7b0d..f4a70463de3 100644 --- a/cpp/include/cudf/utilities/bit.hpp +++ b/cpp/include/cudf/utilities/bit.hpp @@ -42,7 +42,7 @@ namespace detail { #endif template -constexpr CUDA_HOST_DEVICE_CALLABLE std::size_t size_in_bits() +constexpr CUDF_HOST_DEVICE inline std::size_t size_in_bits() { static_assert(CHAR_BIT == 8, "Size of a byte must be 8 bits."); return sizeof(T) * CHAR_BIT; @@ -58,7 +58,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE std::size_t size_in_bits() /** * @brief Returns the index of the word containing the specified bit. */ -constexpr CUDA_HOST_DEVICE_CALLABLE size_type word_index(size_type bit_index) +constexpr CUDF_HOST_DEVICE inline size_type word_index(size_type bit_index) { return bit_index / detail::size_in_bits(); } @@ -66,7 +66,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE size_type word_index(size_type bit_index) /** * @brief Returns the position within a word of the specified bit. */ -constexpr CUDA_HOST_DEVICE_CALLABLE size_type intra_word_index(size_type bit_index) +constexpr CUDF_HOST_DEVICE inline size_type intra_word_index(size_type bit_index) { return bit_index % detail::size_in_bits(); } @@ -80,7 +80,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE size_type intra_word_index(size_type bit_ind * @param bitmask The bitmask containing the bit to set * @param bit_index Index of the bit to set */ -CUDA_HOST_DEVICE_CALLABLE void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index) +CUDF_HOST_DEVICE inline void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index) { assert(nullptr != bitmask); bitmask[word_index(bit_index)] |= (bitmask_type{1} << intra_word_index(bit_index)); @@ -95,7 +95,7 @@ CUDA_HOST_DEVICE_CALLABLE void set_bit_unsafe(bitmask_type* bitmask, size_type b * @param bitmask The bitmask containing the bit to clear * @param bit_index The index of the bit to clear */ -CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index) +CUDF_HOST_DEVICE inline void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index) { assert(nullptr != bitmask); bitmask[word_index(bit_index)] &= ~(bitmask_type{1} << intra_word_index(bit_index)); @@ -109,7 +109,7 @@ CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type * @return true The specified bit is `1` * @return false The specified bit is `0` */ -CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type bit_index) +CUDF_HOST_DEVICE inline bool bit_is_set(bitmask_type const* bitmask, size_type bit_index) { assert(nullptr != bitmask); return bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index)); @@ -125,9 +125,9 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type * @return false The specified bit is `0` * @return `default_value` if `bitmask` is nullptr */ -CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, - size_type bit_index, - bool default_value) +CUDF_HOST_DEVICE inline bool bit_value_or(bitmask_type const* bitmask, + size_type bit_index, + bool default_value) { return bitmask != nullptr ? bit_is_set(bitmask, bit_index) : default_value; } @@ -140,7 +140,7 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, * @param n The number of least significant bits to set * @return A bitmask word with `n` least significant bits set */ -constexpr CUDA_HOST_DEVICE_CALLABLE bitmask_type set_least_significant_bits(size_type n) +constexpr CUDF_HOST_DEVICE inline bitmask_type set_least_significant_bits(size_type n) { constexpr_assert(0 <= n && n < static_cast(detail::size_in_bits())); return ((bitmask_type{1} << n) - 1); @@ -154,7 +154,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE bitmask_type set_least_significant_bits(size * @param n The number of most significant bits to set * @return A bitmask word with `n` most significant bits set */ -constexpr CUDA_HOST_DEVICE_CALLABLE bitmask_type set_most_significant_bits(size_type n) +constexpr CUDF_HOST_DEVICE inline bitmask_type set_most_significant_bits(size_type n) { constexpr size_type word_size{detail::size_in_bits()}; constexpr_assert(0 <= n && n < word_size); diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index d1bd3049ba3..0b3b3a5df76 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -177,7 +177,7 @@ inline bool is_equality_comparable(data_type type) template constexpr inline bool is_numeric() { - return cuda::std::is_integral() or std::is_floating_point::value; + return cuda::std::is_arithmetic(); } struct is_numeric_impl { diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index d7d38aba4f3..0c6a6ee244c 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -420,7 +420,9 @@ using scalar_device_type_t = typename type_to_scalar_type_impl::ScalarDeviceT template