From 8db918fca81c5f1189721903e7b829a0e3556eb8 Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Wed, 3 Aug 2022 18:48:17 +0000 Subject: [PATCH 001/121] Groupby Apply with JIT (First Commit) --- python/cudf/cudf/core/groupby/groupby.py | 15 +- python/cudf/cudf/core/udf/function.cu | 620 ++++++++++++++++ python/cudf/cudf/core/udf/groupby_function.py | 666 ++++++++++++++++++ python/cudf/cudf/core/udf/templates.py | 23 + python/cudf/cudf/core/udf/utils.py | 11 + python/cudf/cudf/tests/test_groupby.py | 57 ++ 6 files changed, 1391 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/core/udf/function.cu create mode 100644 python/cudf/cudf/core/udf/groupby_function.py diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index c651cfdf3a1..e9ee2c0016f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -20,6 +20,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex +from cudf.core.udf.groupby_function import jit_groupby_apply from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate @@ -540,7 +541,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - def apply(self, function, *args): + def apply(self, function, *args, engine="nonjit"): """Apply a python transformation function over the grouped chunk. Parameters @@ -609,6 +610,17 @@ def mult(df): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, _, grouped_values = self._grouped() + # jit groupby apply only returns Series + if engine == "jit": + chunk_results = jit_groupby_apply( + offsets, grouped_values, function, *args + ) + result = cudf.Series(chunk_results, index=group_names) + result.index.names = self.grouping.names + if self._sort: + result = result.sort_index() + return result + ngroups = len(offsets) - 1 if ngroups > self._MAX_GROUPS_BEFORE_WARN: warnings.warn( @@ -620,6 +632,7 @@ def mult(df): grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] chunk_results = [function(chk, *args) for chk in chunks] + if not len(chunk_results): return self.obj.head(0) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu new file mode 100644 index 00000000000..728a8d84f63 --- /dev/null +++ b/python/cudf/cudf/core/udf/function.cu @@ -0,0 +1,620 @@ +// Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ +#include + +// double atomicAdd +__device__ __forceinline__ double atomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + } while (assumed != old); + + return __longlong_as_double(old); +} + +// double atomicMax +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long old = __double_as_longlong(*address); + while(val > __longlong_as_double(old)) + { + unsigned long long assumed = old; + if((old = atomicCAS((unsigned long long *)address, assumed, __double_as_longlong(val))) == assumed) + break; + } + return __longlong_as_double(old); +} + +// double atomicMin +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long old = __double_as_longlong(*address); + while(val < __longlong_as_double(old)) + { + unsigned long long assumed = old; + if((old = atomicCAS((unsigned long long *)address, assumed, __double_as_longlong(val))) == assumed) + break; + } + return __longlong_as_double(old); +} + +extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_sum = 0; + + __shared__ int64_t sum; + + if (tid == 0) + sum = 0; + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + + __syncthreads(); + + *numba_return_value = sum; + + return 0; +} + +extern "C" __device__ int BlockSum_float64(double *numba_return_value, double *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_sum = 0; + + __shared__ double sum; + + if (tid == 0) + sum = 0; + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd(&sum, local_sum); + + __syncthreads(); + + *numba_return_value = sum; + + return 0; +} + + +extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_sum = 0; + double mean; + + __shared__ int64_t sum; + + if (tid == 0) + sum = 0; + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + + __syncthreads(); + + mean = sum * 1.0 / size; + + *numba_return_value = mean; + + return 0; + +} + +extern "C" __device__ int BlockMean_float64(double 
*numba_return_value, double *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_sum = 0; + double mean; + + __shared__ double sum; + + if (tid == 0) + sum = 0; + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd(&sum, local_sum); + + __syncthreads(); + + mean = sum * 1.0 / size; + + *numba_return_value = mean; + + return 0; + +} + + + +extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_sum = 0; + double local_var = 0; + double mean; + double std; + + __shared__ int64_t sum; + __shared__ double var; + + if (tid == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + + __syncthreads(); + + mean = sum * 1.0 / size; + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } + + atomicAdd(&var, local_var); + + __syncthreads(); + + std = sqrt(var / (size - 1)); + + *numba_return_value = std; + + return 0; +} + +extern "C" __device__ int BlockStd_float64(double *numba_return_value, double *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_sum = 0; + double local_var = 0; + double mean; + double std; + + __shared__ double sum; + __shared__ double var; + + if (tid == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd(&sum, local_sum); + + __syncthreads(); + + mean = sum * 1.0 / size; + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } + + atomicAdd(&var, local_var); + + __syncthreads(); + + std = sqrt(var / (size - 1)); + + *numba_return_value = std; + + return 0; +} + +extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_sum = 0; + double local_var = 0; + double mean; + + __shared__ int64_t sum; + __shared__ double var; + + if (tid == 0) { + sum = 0; + var = 
0; + } + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + + __syncthreads(); + + mean = sum * 1.0 / size; + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } + + atomicAdd(&var, local_var); + + __syncthreads(); + + var = var / (size - 1); + + *numba_return_value = var; + + return 0; +} + +extern "C" __device__ int BlockVar_float64(double *numba_return_value, double *data, int64_t size) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_sum = 0; + double local_var = 0; + double mean; + + __shared__ double sum; + __shared__ double var; + + if (tid == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + local_sum += load; + } + } + + atomicAdd(&sum, local_sum); + + __syncthreads(); + + mean = sum * 1.0 / size; + + // Calculate local sum for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } + + atomicAdd(&var, local_var); + + __syncthreads(); + + var = var / (size - 1); + + *numba_return_value = var; + + return 0; +} + + +// Calculate maximum of the group, return the scalar +extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int local_max = 0; + + __shared__ int smax; + + if (tid == 0) + smax = 0; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int load = data[tid + ITEM * tb_size]; + local_max = max(local_max, load); + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMax(&smax, local_max); + + __syncthreads(); + + *numba_return_value = smax; + + return 0; +} + +// Calculate maximum of the group, return the scalar +extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t *data, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_max = 0; + + __shared__ int64_t smax; + + if (tid == 0) + smax = 0; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + local_max = max(local_max, load); + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMax((long long*) 
(&smax), (long long) local_max); + + __syncthreads(); + + *numba_return_value = smax; + + return 0; +} + +// Calculate maximum of the group, return the scalar +extern "C" __device__ int BlockMax_float64(double *numba_return_value, double *data, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_max = 0; + + __shared__ double smax; + + if (tid == 0) + smax = 0; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + local_max = max(local_max, load); + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMax((&smax), local_max); + + __syncthreads(); + + *numba_return_value = smax; + + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockMin_int32(int *numba_return_value, int *data, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int local_min = INT_MAX; + + __shared__ int smin; + + if (tid == 0) + smin = INT_MAX; + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int load = data[tid + ITEM * tb_size]; + local_min = min(local_min, load); + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMin(&smin, local_min); + + __syncthreads(); + + *numba_return_value = smin; + + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t *data, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_min = INT_MAX; + + __shared__ int64_t smin; + + if (tid == 0) + smin = INT_MAX; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + local_min = min(local_min, load); + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMin((long long*) (&smin), (long long)local_min); + + __syncthreads(); + + *numba_return_value = smin; + + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *data, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_min = INT_MAX; + + __shared__ double smin; + + if (tid == 0) + smin = INT_MAX; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + local_min = min(local_min, load); + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMin((&smin), local_min); + + __syncthreads(); + + *numba_return_value = smin; + + return 0; +} \ No newline at end of file diff --git a/python/cudf/cudf/core/udf/groupby_function.py 
b/python/cudf/cudf/core/udf/groupby_function.py new file mode 100644 index 00000000000..2a9bb7338f1 --- /dev/null +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -0,0 +1,666 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + +import math +import os + +import cupy as cp +import numba +import numpy as np +from numba import cuda, types +from numba.core import cgutils +from numba.core.extending import ( + lower_builtin, + make_attribute_wrapper, + models, + register_model, + type_callable, + typeof_impl, +) +from numba.core.typing import signature as nb_signature +from numba.core.typing.templates import AbstractTemplate, AttributeTemplate +from numba.cuda.cudadecl import registry as cuda_registry +from numba.cuda.cudaimpl import lower as cuda_lower +from numba.np import numpy_support +from numba.types import Record + +from cudf.core.column import as_column +from cudf.core.udf.templates import ( + group_initializer_template, + groupby_apply_kernel_template, +) +from cudf.core.udf.utils import ( + _all_dtypes_from_frame, + _get_kernel_groupby_apply, + _get_udf_return_type, + _supported_cols_from_frame, + _supported_dtypes_from_frame, +) + +numba.config.CUDA_USE_NVIDIA_BINDING = 1 +# Disable occupancy warnings to avoid polluting output when there are few +# groups. +numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +class Group(object): + def __init__(self, group_data, size, dtype): + self.group_data = group_data + self.size = size + self.dtype = dtype + + +class GroupType(numba.types.Type): + def __init__(self, group_scalar_type): + self.group_scalar_type = group_scalar_type + self.group_data_type = types.CPointer(group_scalar_type) + self.size_type = types.int64 + super().__init__(name=f"Group({self.group_scalar_type})") + + +@typeof_impl.register(Group) +def typeof_group(val, c): + return GroupType( + numba.np.numpy_support.from_dtype(val.dtype) + ) # converting from numpy type to numba type + + +@type_callable(Group) +def type_group(context): + def typer(group_data, size): + if isinstance(group_data, types.Array) and isinstance( + size, types.Integer + ): + return GroupType(group_data.dtype) + + return typer + + +@register_model(GroupType) +class GroupModel(models.StructModel): + def __init__( + self, dmm, fe_type + ): # fe_type is fully instantiated group type + members = [ + ("group_data", types.CPointer(fe_type.group_scalar_type)), + ("size", types.int64), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +my_max_int32 = cuda.declare_device( + "BlockMax_int32", "types.int32(types.CPointer(types.int32),types.int64)" +) + +my_max_int64 = cuda.declare_device( + "BlockMax_int64", "types.int64(types.CPointer(types.int64),types.int64)" +) + +my_max_float64 = cuda.declare_device( + "BlockMax_float64", + "types.float64(types.CPointer(types.float64),types.int64)", +) + +my_min_int32 = cuda.declare_device( + "BlockMin_int32", "types.int32(types.CPointer(types.int32),types.int64)" +) + +my_min_int64 = cuda.declare_device( + "BlockMin_int64", "types.int64(types.CPointer(types.int64),types.int64)" +) + +my_min_float64 = cuda.declare_device( + "BlockMin_float64", + "types.float64(types.CPointer(types.float64),types.int64)", +) + +my_count_int64 = cuda.declare_device( + "BlockCount_int64", + "types.int64(types.CPointer(types.int64),types.int64)", +) + +my_count_float64 = cuda.declare_device( + "BlockCount_float64", + "types.int64(types.CPointer(types.float64),types.int64)", +) + +my_sum_int64 = cuda.declare_device( + "BlockSum_int64", 
"types.int64(types.CPointer(types.int64),types.int64)" +) + +my_sum_float64 = cuda.declare_device( + "BlockSum_float64", + "types.float64(types.CPointer(types.float64),types.int64)", +) + +my_mean_int64 = cuda.declare_device( + "BlockMean_int64", + "types.float64(types.CPointer(types.int64),types.int64)", +) + +my_mean_float64 = cuda.declare_device( + "BlockMean_float64", + "types.float64(types.CPointer(types.float64),types.int64)", +) + +my_std_int64 = cuda.declare_device( + "BlockStd_int64", + "types.float64(types.CPointer(types.int64),types.int64)", +) + +my_std_float64 = cuda.declare_device( + "BlockStd_float64", + "types.float64(types.CPointer(types.float64),types.int64)", +) + +my_var_int64 = cuda.declare_device( + "BlockVar_int64", + "types.float64(types.CPointer(types.int64),types.int64)", +) + +my_var_float64 = cuda.declare_device( + "BlockVar_float64", + "types.float64(types.CPointer(types.float64),types.int64)", +) + +# Path to the source containing the foreign function +basedir = os.path.dirname(os.path.realpath(__file__)) +dev_func_ptx = os.path.join(basedir, "function.ptx") + + +def call_my_max_int32(data, size): + return my_max_int32(data, size) + + +def call_my_max_int64(data, size): + return my_max_int64(data, size) + + +def call_my_max_float64(data, size): + return my_max_float64(data, size) + + +def call_my_min_int32(data, size): + return my_min_int32(data, size) + + +def call_my_min_int64(data, size): + return my_min_int64(data, size) + + +def call_my_min_float64(data, size): + return my_min_float64(data, size) + + +def call_my_count_int64(data, size): + return my_count_int64(data, size) + + +def call_my_count_float64(data, size): + return my_count_float64(data, size) + + +def call_my_sum_int64(data, size): + return my_sum_int64(data, size) + + +def call_my_sum_float64(data, size): + return my_sum_float64(data, size) + + +def call_my_mean_int64(data, size): + return my_mean_int64(data, size) + + +def call_my_mean_float64(data, size): + return my_mean_float64(data, size) + + +def call_my_std_int64(data, size): + return my_std_int64(data, size) + + +def call_my_std_float64(data, size): + return my_std_float64(data, size) + + +def call_my_var_int64(data, size): + return my_var_int64(data, size) + + +def call_my_var_float64(data, size): + return my_var_float64(data, size) + + +@lower_builtin(Group, types.Array, types.int64) +def group_constructor(context, builder, sig, args): + group_data, size = args + + grp = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + arr_group_data = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=group_data + ) + group_data_ptr = arr_group_data.data + + grp.group_data = group_data_ptr + grp.size = size + + return grp._getvalue() + + +make_attribute_wrapper(GroupType, "group_data", "group_data") +make_attribute_wrapper(GroupType, "size", "size") + + +class GroupMax(AbstractTemplate): + key = "GroupType.max" + + def generic(self, args, kws): + return nb_signature(self.this.group_scalar_type, recvr=self.this) + + +class GroupMin(AbstractTemplate): + key = "GroupType.min" + + def generic(self, args, kws): + return nb_signature(self.this.group_scalar_type, recvr=self.this) + + +class GroupSize(AbstractTemplate): + key = "GroupType.size" + + def generic(self, args, kws): + return nb_signature(types.int64, recvr=self.this) + + +class GroupCount(AbstractTemplate): + key = "GroupType.count" + + def generic(self, args, kws): + return nb_signature(types.int64, recvr=self.this) + + +class GroupSum(AbstractTemplate): + key 
= "GroupType.sum" + + def generic(self, args, kws): + return nb_signature(self.this.group_scalar_type, recvr=self.this) + + +class GroupMean(AbstractTemplate): + key = "GroupType.mean" + + def generic(self, args, kws): + return nb_signature(types.float64, recvr=self.this) + + +class GroupStd(AbstractTemplate): + key = "GroupType.std" + + def generic(self, args, kws): + return nb_signature(types.float64, recvr=self.this) + + +class GroupVar(AbstractTemplate): + key = "GroupType.var" + + def generic(self, args, kws): + return nb_signature(types.float64, recvr=self.this) + + +@cuda_registry.register_attr +class GroupAttr(AttributeTemplate): + key = GroupType + + def resolve_max(self, mod): + return types.BoundFunction(GroupMax, GroupType(mod.group_scalar_type)) + + def resolve_min(self, mod): + return types.BoundFunction(GroupMin, GroupType(mod.group_scalar_type)) + + def resolve_size(self, mod): + return types.BoundFunction(GroupSize, GroupType(mod.group_scalar_type)) + + def resolve_count(self, mod): + return types.BoundFunction( + GroupCount, GroupType(mod.group_scalar_type) + ) + + def resolve_sum(self, mod): + return types.BoundFunction(GroupSum, GroupType(mod.group_scalar_type)) + + def resolve_mean(self, mod): + return types.BoundFunction(GroupMean, GroupType(mod.group_scalar_type)) + + def resolve_std(self, mod): + return types.BoundFunction(GroupStd, GroupType(mod.group_scalar_type)) + + def resolve_var(self, mod): + return types.BoundFunction(GroupVar, GroupType(mod.group_scalar_type)) + + +@cuda_lower("GroupType.max", GroupType(types.int32)) +@cuda_lower("GroupType.max", GroupType(types.int64)) +@cuda_lower("GroupType.max", GroupType(types.float64)) +def cuda_Group_max(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + if grp_type.group_scalar_type == types.int32: + func = call_my_max_int32 + elif grp_type.group_scalar_type == types.int64: + func = call_my_max_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_max_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, grp_type.size_type), + (builder.load(group_data_ptr), grp.size), + ) + + return result + + +@cuda_lower("GroupType.min", GroupType(types.int32)) +@cuda_lower("GroupType.min", GroupType(types.int64)) +@cuda_lower("GroupType.min", GroupType(types.float64)) +def cuda_Group_min(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + if grp_type.group_scalar_type == types.int32: + func = call_my_min_int32 + elif grp_type.group_scalar_type == types.int64: + func = call_my_min_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_min_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, grp_type.size_type), + (builder.load(group_data_ptr), grp.size), + ) + return result + + +@cuda_lower("GroupType.size", GroupType(types.int64)) +@cuda_lower("GroupType.size", GroupType(types.float64)) +def cuda_Group_size(context, builder, sig, args): + grp = 
cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + result = grp.size + return result + + +@cuda_lower("GroupType.count", GroupType(types.int64)) +@cuda_lower("GroupType.count", GroupType(types.float64)) +def cuda_Group_count(context, builder, sig, args): + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + result = grp.size + return result + + +@cuda_lower("GroupType.sum", GroupType(types.int64)) +@cuda_lower("GroupType.sum", GroupType(types.float64)) +def cuda_Group_sum(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + if grp_type.group_scalar_type == types.int64: + func = call_my_sum_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_sum_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, grp_type.size_type), + (builder.load(group_data_ptr), grp.size), + ) + return result + + +@cuda_lower("GroupType.mean", GroupType(types.int64)) +@cuda_lower("GroupType.mean", GroupType(types.float64)) +def cuda_Group_mean(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + if grp_type.group_scalar_type == types.int64: + func = call_my_mean_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_mean_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, grp_type.size_type), + (builder.load(group_data_ptr), grp.size), + ) + return result + + +@cuda_lower("GroupType.std", GroupType(types.int64)) +@cuda_lower("GroupType.std", GroupType(types.float64)) +def cuda_Group_std(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + if grp_type.group_scalar_type == types.int64: + func = call_my_std_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_std_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, grp_type.size_type), + (builder.load(group_data_ptr), grp.size), + ) + return result + + +@cuda_lower("GroupType.var", GroupType(types.int64)) +@cuda_lower("GroupType.var", GroupType(types.float64)) +def cuda_Group_var(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + if grp_type.group_scalar_type == types.int64: + func = call_my_var_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_var_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, grp_type.size_type), + (builder.load(group_data_ptr), grp.size), + ) 
+    return result
+
+
+def _get_frame_groupby_type(dtype):
+    """
+    Get the numba `Record` type corresponding to a frame.
+    Models the frame as a dictionary-like data structure
+    containing GroupTypes.
+    Large parts of this function are copied with comments
+    from the Numba internals and slightly modified so that
+    each field of the resulting struct is a Group rather
+    than a scalar value.
+    See numba.np.numpy_support.from_struct_dtype for details.
+    """
+
+    # Create the numpy structured type corresponding to the numpy dtype.
+
+    fields = []
+    offset = 0
+
+    sizes = [val[0].itemsize for val in dtype.fields.values()]
+    for i, (name, info) in enumerate(dtype.fields.items()):
+        # *info* consists of the element dtype, its offset from the beginning
+        # of the record, and an optional "title" containing metadata.
+        # We ignore the offset in info because its value assumes no masking;
+        # instead, we compute the correct offset from the size of the
+        # Group struct.
+        elemdtype = info[0]
+        title = info[2] if len(info) == 3 else None
+        ty = numpy_support.from_dtype(elemdtype)
+        infos = {
+            "type": GroupType(ty),
+            "offset": offset,
+            "title": title,
+        }
+        fields.append((name, infos))
+
+        # Increment the offset by the size of the Group struct
+        offset += 8 + 8  # one data pointer and one size integer
+
+        # Align the next member of the struct to be a multiple of the
+        # memory access size, per PTX ISA 7.4/5.4.5
+        if i < len(sizes) - 1:
+            # next_itemsize = sizes[i + 1]
+            next_itemsize = 8
+            offset = int(math.ceil(offset / next_itemsize) * next_itemsize)
+
+    # Numba requires that structures are aligned for the CUDA target
+    _is_aligned_struct = True
+    return Record(fields, offset, _is_aligned_struct)
+
+
+def _groupby_apply_kernel_string_from_template(frame, args):
+    """
+    Function to write numba kernels for `GroupBy.apply` as a string.
+    Workaround until numba supports functions that use `*args`.
+
+    Both the number of input columns and any scalar arguments may vary,
+    so the kernels vary significantly. See
+    templates.py for the full groupby apply kernel template and more details.
+ """ + # Create argument list for kernel + frame = _supported_cols_from_frame(frame) + + input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + # Generate the initializers for each device function argument + initializers = [] + for i, (colname, col) in enumerate(frame.items()): + idx = str(i) + initializers.append( + group_initializer_template.format(idx=idx, name=colname) + ) + + return groupby_apply_kernel_template.format( + input_columns=input_columns, + extra_args=extra_args, + group_initializers="\n".join(initializers), + ) + + +def _get_groupby_apply_kernel(frame, func, args): + dataframe_group_type = _get_frame_groupby_type( + np.dtype(list(_all_dtypes_from_frame(frame).items())) + ) + return_type = _get_udf_return_type(dataframe_group_type, func, args) + + np_field_types = np.dtype( + list(_supported_dtypes_from_frame(frame).items()) + ) + dataframe_group_type = _get_frame_groupby_type(np_field_types) + + # Dict of 'local' variables into which `_kernel` is defined + global_exec_context = { + "cuda": cuda, + "Group": Group, + "dataframe_group_type": dataframe_group_type, + "types": types, + } + kernel_string = _groupby_apply_kernel_string_from_template(frame, args) + + kernel = _get_kernel_groupby_apply( + kernel_string, global_exec_context, func, dev_func_ptx + ) + + return kernel, return_type + + +def jit_groupby_apply(offsets, grouped_values, function, *args): + ngroups = len(offsets) - 1 + + kernel, return_type = _get_groupby_apply_kernel( + grouped_values, function, args + ) + + return_type = numpy_support.as_dtype(return_type) + + output = cp.empty(ngroups, dtype=return_type) + + launch_args = [cp.asarray(offsets), output] + + for col in _supported_cols_from_frame(grouped_values).values(): + launch_args.append(cp.asarray(col)) + + launch_args += list(args) + + stream = cuda.default_stream() + + kernel[ngroups, 256](*launch_args) + + stream.synchronize() + + return as_column(output, dtype=output.dtype) diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py index 3ac7083582f..a8b3c902136 100644 --- a/python/cudf/cudf/core/udf/templates.py +++ b/python/cudf/cudf/core/udf/templates.py @@ -14,6 +14,11 @@ row["{name}"] = masked_{idx} """ +group_initializer_template = """\ + arr_{idx} = input_col_{idx}[offset[block_id]:offset[block_id+1]] + dataframe_group["{name}"] = Group(arr_{idx}, size) +""" + row_kernel_template = """\ def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): i = cuda.grid(1) @@ -52,3 +57,21 @@ def _kernel(retval, size, input_col_0, offset_0, {extra_args}): ret_data_arr[i] = ret_masked.value ret_mask_arr[i] = ret_masked.valid """ + +groupby_apply_kernel_template = """ +def _kernel(offset, out, {input_columns}, {extra_args}): + tid = cuda.threadIdx.x + block_id = cuda.blockIdx.x + tb_size = cuda.blockDim.x + + recarray = cuda.local.array(1, dtype=dataframe_group_type) + dataframe_group = recarray[0] + + if block_id < (len(offset) - 1): + + size = offset[block_id+1] - offset[block_id] + +{group_initializers} + + out[block_id] = f_(dataframe_group, {extra_args}) +""" diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index f5c270a3705..36a6b011022 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -216,3 +216,14 @@ def _get_kernel(kernel_string, globals_, sig, func): kernel = cuda.jit(sig)(_kernel) return kernel + + +def 
_get_kernel_groupby_apply(kernel_string, globals_, func, dev_func_ptx): + """template kernel compilation helper function for groupby apply""" + f_ = cuda.jit(device=True)(func) + globals_["f_"] = f_ + exec(kernel_string, globals_) + _kernel = globals_["_kernel"] + kernel = cuda.jit(link=[dev_func_ptx])(_kernel) + + return kernel diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index bd5e9fe017b..122f584fc50 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -364,6 +364,63 @@ def emulate(df): assert_groupby_results_equal(expect, got) +def test_groupby_apply_jit(): + np.random.seed(0) + df = DataFrame() + nelem = 20 + df["key1"] = np.random.randint(0, 3, nelem) + df["key2"] = np.random.randint(0, 2, nelem) + df["val1"] = np.random.random(nelem) + df["val2"] = np.random.random(nelem) + + expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) + got_grpby = df.groupby(["key1", "key2"]) + + def foo(df): + return df["val1"].max() + df["val2"].min() + + expect = expect_grpby.apply(foo) + got_nonjit = got_grpby.apply(foo) + got_jit = got_grpby.apply(foo, engine="jit") + assert_groupby_results_equal(expect, got_nonjit) + assert_groupby_results_equal(expect, got_jit) + + +def create_test_groupby_apply_jit_args_params(): + def f1(df, k): + return df["val1"].max() + df["val2"].min() + k + + def f2(df, k, L): + return df["val1"].sum() - df["val2"].var() + (k / L) + + def f3(df, k, L, m): + return ((k * df["val1"].mean()) + (L * df["val2"].std())) / m + + return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] + + +@pytest.mark.parametrize( + "func,args", create_test_groupby_apply_jit_args_params() +) +def test_groupby_apply_jit_args(func, args): + np.random.seed(0) + df = DataFrame() + nelem = 20 + df["key1"] = np.random.randint(0, 3, nelem) + df["key2"] = np.random.randint(0, 2, nelem) + df["val1"] = np.random.random(nelem) + df["val2"] = np.random.random(nelem) + + expect_grpby = df.to_pandas().groupby(["key1", "key2"]) + got_grpby = df.groupby(["key1", "key2"]) + + expect = expect_grpby.apply(func, *args) + got_nonjit = got_grpby.apply(func, *args) + got_jit = got_grpby.apply(func, *args, engine="jit") + assert_groupby_results_equal(expect, got_nonjit) + assert_groupby_results_equal(expect, got_jit) + + @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", From 2d6b4c99dcbe9d77b4ada57abd7dd50e4383488b Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Thu, 4 Aug 2022 14:38:10 +0000 Subject: [PATCH 002/121] Fix error in Pytest --- python/cudf/cudf/core/groupby/groupby.py | 14 +++++++++++--- python/cudf/cudf/core/udf/groupby_function.py | 1 - 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e9ee2c0016f..dcfe4f89ee8 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -75,7 +75,14 @@ class GroupBy(Serializable, Reducible, Scannable): _MAX_GROUPS_BEFORE_WARN = 100 def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True + self, + obj, + by=None, + level=None, + sort=False, + as_index=True, + dropna=True, + engine="nonjit", ): """ Group a DataFrame or Series by a set of columns. 
@@ -111,6 +118,7 @@ def __init__( self._level = level self._sort = sort self._dropna = dropna + self._engine = engine if isinstance(by, _Grouping): by._obj = self.obj @@ -610,8 +618,8 @@ def mult(df): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, _, grouped_values = self._grouped() - # jit groupby apply only returns Series - if engine == "jit": + self._engine = engine + if self._engine == "jit": chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args ) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 2a9bb7338f1..c87b675e227 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -36,7 +36,6 @@ _supported_dtypes_from_frame, ) -numba.config.CUDA_USE_NVIDIA_BINDING = 1 # Disable occupancy warnings to avoid polluting output when there are few # groups. numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 From fd8680e2253396ddd4a55a63910ad7a26e55ef2e Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Fri, 5 Aug 2022 13:55:49 +0000 Subject: [PATCH 003/121] JIT Caching Support --- python/cudf/cudf/core/udf/function.cu | 198 +++++++++++++++++- python/cudf/cudf/core/udf/groupby_function.py | 10 +- 2 files changed, 198 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index 728a8d84f63..acbdac1465c 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -410,7 +410,7 @@ extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; - int local_max = 0; + int local_max = INT_MIN; __shared__ int smax; @@ -446,7 +446,7 @@ extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t *d int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; - int64_t local_max = 0; + int64_t local_max = INT64_MIN; __shared__ int64_t smax; @@ -482,7 +482,7 @@ extern "C" __device__ int BlockMax_float64(double *numba_return_value, double *d int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; - double local_max = 0; + double local_max = -DBL_MAX; __shared__ double smax; @@ -553,7 +553,7 @@ extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t *d int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; - int64_t local_min = INT_MAX; + int64_t local_min = INT64_MAX; __shared__ int64_t smin; @@ -589,7 +589,7 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *d int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; - double local_min = INT_MAX; + double local_min = DBL_MAX; __shared__ double smin; @@ -616,5 +616,193 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *d *numba_return_value = smin; + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t *data, 
int64_t index, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_max = INT64_MIN; + int64_t local_idx = -1; + + __shared__ int64_t smax; + __shared__ int64_t sidx; + + if (tid == 0) + smax = 0; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + if (load > local_max) { + local_max = load; + local_idx = index[tid + ITEM * tb_size] + } + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMax((long long*) (&smax), (long long) local_max); + + __syncthreads(); + + if (local_max == smax) { + atomicMin((long long*) (&sidx), (long long)local_idx); + } + + __syncthreads(); + + *numba_return_value = sidx; + + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, double *data, int64_t index, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_max = -DBL_MAX; + int64_t local_idx = -1; + + __shared__ double smax; + __shared__ int64_t sidx; + + if (tid == 0) + smax = 0; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + if (load > local_max) { + local_max = load; + local_idx = index[tid + ITEM * tb_size] + } + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMax((&smax), local_max); + + __syncthreads(); + + if (local_max == smax) { + atomicMin((long long*) (&sidx), (long long)local_idx); + } + + __syncthreads(); + + *numba_return_value = sidx; + + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t *data, int64_t index, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + int64_t local_min = INT64_MAX; + int64_t local_idx = -1; + + __shared__ int64_t smin; + __shared__ int64_t sidx; + + if (tid == 0) + smin = INT_MAX; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + int64_t load = data[tid + ITEM * tb_size]; + if (load < local_min) { + local_min = load; + local_idx = index[tid + ITEM * tb_size] + } + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMin((long long*) (&smin), (long long) local_min); + + __syncthreads(); + + if (local_min == smin) { + atomicMin((long long*) (&sidx), (long long)local_idx); + } + + __syncthreads(); + + *numba_return_value = sidx; + + return 0; +} + +// Calculate minimum of the group, return the scalar +extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, double *data, int64_t index, int64_t size) { + + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + double local_min = DBL_MAX; + 
int64_t local_idx = -1; + + __shared__ double smin; + __shared__ int64_t sidx; + + if (tid == 0) + smin = INT_MAX; + + __syncthreads(); + + // Calculate local max for each thread + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { + if (tid + (ITEM * tb_size) < size) { + double load = data[tid + ITEM * tb_size]; + if (load < local_min) { + local_min = load; + local_idx = index[tid + ITEM * tb_size] + } + } + } + + __syncthreads(); + + // Calculate local max for each group + atomicMin((&smin), local_min); + + __syncthreads(); + + if (local_min == smin) { + atomicMin((long long*) (&sidx), (long long)local_idx); + } + + __syncthreads(); + + *numba_return_value = sidx; + return 0; } \ No newline at end of file diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index c87b675e227..f17ce9d6d32 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -30,6 +30,7 @@ ) from cudf.core.udf.utils import ( _all_dtypes_from_frame, + _compile_or_get, _get_kernel_groupby_apply, _get_udf_return_type, _supported_cols_from_frame, @@ -615,6 +616,7 @@ def _get_groupby_apply_kernel(frame, func, args): dataframe_group_type = _get_frame_groupby_type( np.dtype(list(_all_dtypes_from_frame(frame).items())) ) + return_type = _get_udf_return_type(dataframe_group_type, func, args) np_field_types = np.dtype( @@ -641,12 +643,10 @@ def _get_groupby_apply_kernel(frame, func, args): def jit_groupby_apply(offsets, grouped_values, function, *args): ngroups = len(offsets) - 1 - kernel, return_type = _get_groupby_apply_kernel( - grouped_values, function, args + kernel, return_type = _compile_or_get( + grouped_values, function, args, _get_groupby_apply_kernel ) - return_type = numpy_support.as_dtype(return_type) - output = cp.empty(ngroups, dtype=return_type) launch_args = [cp.asarray(offsets), output] @@ -658,7 +658,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): stream = cuda.default_stream() - kernel[ngroups, 256](*launch_args) + kernel[ngroups, 256, stream](*launch_args) stream.synchronize() From 922065819f0dfe5db260904fb6c9be4a07d875c1 Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Fri, 5 Aug 2022 17:08:36 +0000 Subject: [PATCH 004/121] Add IdxMax and IdxMin --- python/cudf/cudf/core/udf/function.cu | 53 +++-- python/cudf/cudf/core/udf/groupby_function.py | 215 ++++++++++++++++-- python/cudf/cudf/core/udf/templates.py | 5 +- 3 files changed, 225 insertions(+), 48 deletions(-) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index acbdac1465c..47e3947c65a 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -1,6 +1,7 @@ // Copyright (c) 2020-2022, NVIDIA CORPORATION. 
#include +#include // double atomicAdd __device__ __forceinline__ double atomicAdd(double* address, double val) @@ -415,7 +416,7 @@ extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int __shared__ int smax; if (tid == 0) - smax = 0; + smax = INT_MIN; __syncthreads(); @@ -451,7 +452,7 @@ extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t *d __shared__ int64_t smax; if (tid == 0) - smax = 0; + smax = INT64_MIN; __syncthreads(); @@ -487,7 +488,7 @@ extern "C" __device__ int BlockMax_float64(double *numba_return_value, double *d __shared__ double smax; if (tid == 0) - smax = 0; + smax = -DBL_MAX; __syncthreads(); @@ -558,7 +559,7 @@ extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t *d __shared__ int64_t smin; if (tid == 0) - smin = INT_MAX; + smin = INT64_MAX; __syncthreads(); @@ -594,7 +595,7 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *d __shared__ double smin; if (tid == 0) - smin = INT_MAX; + smin = DBL_MAX; __syncthreads(); @@ -620,7 +621,7 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *d } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t *data, int64_t index, int64_t size) { +extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t *data, int64_t* index, int64_t size) { int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on @@ -631,8 +632,10 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t __shared__ int64_t smax; __shared__ int64_t sidx; - if (tid == 0) - smax = 0; + if (tid == 0) { + smax = INT64_MIN; + sidx = INT64_MAX; + } __syncthreads(); @@ -643,7 +646,7 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t int64_t load = data[tid + ITEM * tb_size]; if (load > local_max) { local_max = load; - local_idx = index[tid + ITEM * tb_size] + local_idx = index[tid + ITEM * tb_size]; } } } @@ -656,7 +659,7 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t __syncthreads(); if (local_max == smax) { - atomicMin((long long*) (&sidx), (long long)local_idx); + atomicMin((long long*) (&sidx), (long long) local_idx); } __syncthreads(); @@ -667,7 +670,7 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, double *data, int64_t index, int64_t size) { +extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, double *data, int64_t* index, int64_t size) { int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on @@ -678,8 +681,10 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl __shared__ double smax; __shared__ int64_t sidx; - if (tid == 0) - smax = 0; + if (tid == 0) { + smax = -DBL_MAX; + sidx = INT64_MAX; + } __syncthreads(); @@ -690,7 +695,7 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl double load = data[tid + ITEM * tb_size]; if (load > local_max) { local_max = load; - local_idx = index[tid + ITEM * tb_size] + local_idx = index[tid + ITEM * tb_size]; } } } @@ -714,7 +719,7 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl } // Calculate minimum of the group, 
return the scalar -extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t *data, int64_t index, int64_t size) { +extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t *data, int64_t* index, int64_t size) { int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on @@ -725,8 +730,10 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t __shared__ int64_t smin; __shared__ int64_t sidx; - if (tid == 0) - smin = INT_MAX; + if (tid == 0) { + smin = INT64_MAX; + sidx = INT64_MAX; + } __syncthreads(); @@ -737,7 +744,7 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t int64_t load = data[tid + ITEM * tb_size]; if (load < local_min) { local_min = load; - local_idx = index[tid + ITEM * tb_size] + local_idx = index[tid + ITEM * tb_size]; } } } @@ -761,7 +768,7 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, double *data, int64_t index, int64_t size) { +extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, double *data, int64_t* index, int64_t size) { int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on @@ -772,8 +779,10 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, doubl __shared__ double smin; __shared__ int64_t sidx; - if (tid == 0) - smin = INT_MAX; + if (tid == 0) { + smin = DBL_MAX; + sidx = INT64_MAX; + } __syncthreads(); @@ -784,7 +793,7 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, doubl double load = data[tid + ITEM * tb_size]; if (load < local_min) { local_min = load; - local_idx = index[tid + ITEM * tb_size] + local_idx = index[tid + ITEM * tb_size]; } } } diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index f17ce9d6d32..9fd2be5679a 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -43,34 +43,43 @@ class Group(object): - def __init__(self, group_data, size, dtype): + def __init__(self, group_data, size, index, dtype, index_dtype): self.group_data = group_data self.size = size + self.index = index self.dtype = dtype + self.index_dtype = index_dtype class GroupType(numba.types.Type): - def __init__(self, group_scalar_type): + def __init__(self, group_scalar_type, index_type=types.int64): self.group_scalar_type = group_scalar_type + self.index_type = index_type self.group_data_type = types.CPointer(group_scalar_type) self.size_type = types.int64 - super().__init__(name=f"Group({self.group_scalar_type})") + self.group_index_type = types.CPointer(index_type) + super().__init__( + name=f"Group({self.group_scalar_type}, {self.index_type})" + ) @typeof_impl.register(Group) def typeof_group(val, c): return GroupType( - numba.np.numpy_support.from_dtype(val.dtype) + numba.np.numpy_support.from_dtype(val.dtype), + numba.np.numpy_support.from_dtype(val.index_dtype), ) # converting from numpy type to numba type @type_callable(Group) def type_group(context): - def typer(group_data, size): - if isinstance(group_data, types.Array) and isinstance( - size, types.Integer + def typer(group_data, size, index): + if ( + isinstance(group_data, types.Array) + and isinstance(size, types.Integer) + and isinstance(index, types.Array) 
): - return GroupType(group_data.dtype) + return GroupType(group_data.dtype, index.dtype) return typer @@ -83,6 +92,7 @@ def __init__( members = [ ("group_data", types.CPointer(fe_type.group_scalar_type)), ("size", types.int64), + ("index", types.CPointer(fe_type.index_type)), ] models.StructModel.__init__(self, dmm, fe_type, members) @@ -162,6 +172,30 @@ def __init__( "types.float64(types.CPointer(types.float64),types.int64)", ) +my_idxmax_int64 = cuda.declare_device( + "BlockIdxMax_int64", + "types.int64(types.CPointer(types.int64)," + + "types.CPointer(types.int64),types.int64)", +) + +my_idxmax_float64 = cuda.declare_device( + "BlockIdxMax_float64", + "types.int64(types.CPointer(types.float64)," + + "types.CPointer(types.int64),types.int64)", +) + +my_idxmin_int64 = cuda.declare_device( + "BlockIdxMin_int64", + "types.int64(types.CPointer(types.int64)," + + "types.CPointer(types.int64),types.int64)", +) + +my_idxmin_float64 = cuda.declare_device( + "BlockIdxMin_float64", + "types.int64(types.CPointer(types.float64)," + + "types.CPointer(types.int64),types.int64)", +) + # Path to the source containing the foreign function basedir = os.path.dirname(os.path.realpath(__file__)) dev_func_ptx = os.path.join(basedir, "function.ptx") @@ -231,9 +265,25 @@ def call_my_var_float64(data, size): return my_var_float64(data, size) -@lower_builtin(Group, types.Array, types.int64) +def call_my_idxmax_int64(data, index, size): + return my_idxmax_int64(data, index, size) + + +def call_my_idxmax_float64(data, index, size): + return my_idxmax_float64(data, index, size) + + +def call_my_idxmin_int64(data, index, size): + return my_idxmin_int64(data, index, size) + + +def call_my_idxmin_float64(data, index, size): + return my_idxmin_float64(data, index, size) + + +@lower_builtin(Group, types.Array, types.int64, types.Array) def group_constructor(context, builder, sig, args): - group_data, size = args + group_data, size, index = args grp = cgutils.create_struct_proxy(sig.return_type)(context, builder) @@ -242,13 +292,20 @@ def group_constructor(context, builder, sig, args): ) group_data_ptr = arr_group_data.data + arr_index = cgutils.create_struct_proxy(sig.args[2])( + context, builder, value=index + ) + index_ptr = arr_index.data + grp.group_data = group_data_ptr + grp.index = index_ptr grp.size = size return grp._getvalue() make_attribute_wrapper(GroupType, "group_data", "group_data") +make_attribute_wrapper(GroupType, "index", "index") make_attribute_wrapper(GroupType, "size", "size") @@ -308,35 +365,73 @@ def generic(self, args, kws): return nb_signature(types.float64, recvr=self.this) +class GroupIdxMax(AbstractTemplate): + key = "GroupType.idxmax" + + def generic(self, args, kws): + return nb_signature(self.this.index_type, recvr=self.this) + + +class GroupIdxMin(AbstractTemplate): + key = "GroupType.idxmin" + + def generic(self, args, kws): + return nb_signature(self.this.index_type, recvr=self.this) + + @cuda_registry.register_attr class GroupAttr(AttributeTemplate): key = GroupType def resolve_max(self, mod): - return types.BoundFunction(GroupMax, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + GroupMax, GroupType(mod.group_scalar_type, mod.index_type) + ) def resolve_min(self, mod): - return types.BoundFunction(GroupMin, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + GroupMin, GroupType(mod.group_scalar_type, mod.index_type) + ) def resolve_size(self, mod): - return types.BoundFunction(GroupSize, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + 
GroupSize, GroupType(mod.group_scalar_type, mod.index_type) + ) def resolve_count(self, mod): return types.BoundFunction( - GroupCount, GroupType(mod.group_scalar_type) + GroupCount, GroupType(mod.group_scalar_type, mod.index_type) ) def resolve_sum(self, mod): - return types.BoundFunction(GroupSum, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + GroupSum, GroupType(mod.group_scalar_type, mod.index_type) + ) def resolve_mean(self, mod): - return types.BoundFunction(GroupMean, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + GroupMean, GroupType(mod.group_scalar_type, mod.index_type) + ) def resolve_std(self, mod): - return types.BoundFunction(GroupStd, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + GroupStd, GroupType(mod.group_scalar_type, mod.index_type) + ) def resolve_var(self, mod): - return types.BoundFunction(GroupVar, GroupType(mod.group_scalar_type)) + return types.BoundFunction( + GroupVar, GroupType(mod.group_scalar_type, mod.index_type) + ) + + def resolve_idxmax(self, mod): + return types.BoundFunction( + GroupIdxMax, GroupType(mod.group_scalar_type, mod.index_type) + ) + + def resolve_idxmin(self, mod): + return types.BoundFunction( + GroupIdxMin, GroupType(mod.group_scalar_type, mod.index_type) + ) @cuda_lower("GroupType.max", GroupType(types.int32)) @@ -534,7 +629,71 @@ def cuda_Group_var(context, builder, sig, args): return result -def _get_frame_groupby_type(dtype): +@cuda_lower("GroupType.idxmax", GroupType(types.int64, types.int64)) +@cuda_lower("GroupType.idxmax", GroupType(types.float64, types.int64)) +def cuda_Group_idxmax(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + index_dataty = grp_type.group_index_type + index_ptr = builder.alloca(grp.index.type) + builder.store(grp.index, index_ptr) + + if grp_type.group_scalar_type == types.int64: + func = call_my_idxmax_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_idxmax_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), + (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), + ) + return result + + +@cuda_lower("GroupType.idxmin", GroupType(types.int64, types.int64)) +@cuda_lower("GroupType.idxmin", GroupType(types.float64, types.int64)) +def cuda_Group_idxmin(context, builder, sig, args): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + index_dataty = grp_type.group_index_type + index_ptr = builder.alloca(grp.index.type) + builder.store(grp.index, index_ptr) + + if grp_type.group_scalar_type == types.int64: + func = call_my_idxmin_int64 + elif grp_type.group_scalar_type == types.float64: + func = call_my_idxmin_float64 + + result = context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), + (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), + ) + return result + + +def _get_frame_groupby_type(dtype, index_dtype): """ Get the numba `Record` type corresponding to 
a frame. Models the column as a dictionary like data structure @@ -560,15 +719,16 @@ def _get_frame_groupby_type(dtype): elemdtype = info[0] title = info[2] if len(info) == 3 else None ty = numpy_support.from_dtype(elemdtype) + indexty = numpy_support.from_dtype(index_dtype) infos = { - "type": GroupType(ty), + "type": GroupType(ty, indexty), "offset": offset, "title": title, } fields.append((name, infos)) # increment offset by itemsize plus one byte for validity - offset += 8 + 8 # group struct size (2 pointers and 1 integer) + offset += 8 + 8 + 8 # group struct size (2 pointers and 1 integer) # Align the next member of the struct to be a multiple of the # memory access size, per PTX ISA 7.4/5.4.5 @@ -614,7 +774,8 @@ def _groupby_apply_kernel_string_from_template(frame, args): def _get_groupby_apply_kernel(frame, func, args): dataframe_group_type = _get_frame_groupby_type( - np.dtype(list(_all_dtypes_from_frame(frame).items())) + np.dtype(list(_all_dtypes_from_frame(frame).items())), + frame.index.dtype, ) return_type = _get_udf_return_type(dataframe_group_type, func, args) @@ -622,7 +783,9 @@ def _get_groupby_apply_kernel(frame, func, args): np_field_types = np.dtype( list(_supported_dtypes_from_frame(frame).items()) ) - dataframe_group_type = _get_frame_groupby_type(np_field_types) + dataframe_group_type = _get_frame_groupby_type( + np_field_types, frame.index.dtype + ) # Dict of 'local' variables into which `_kernel` is defined global_exec_context = { @@ -649,7 +812,11 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): output = cp.empty(ngroups, dtype=return_type) - launch_args = [cp.asarray(offsets), output] + launch_args = [ + cp.asarray(offsets), + output, + cp.asarray(grouped_values.index), + ] for col in _supported_cols_from_frame(grouped_values).values(): launch_args.append(cp.asarray(col)) diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py index a8b3c902136..a4eca4a7efe 100644 --- a/python/cudf/cudf/core/udf/templates.py +++ b/python/cudf/cudf/core/udf/templates.py @@ -16,7 +16,7 @@ group_initializer_template = """\ arr_{idx} = input_col_{idx}[offset[block_id]:offset[block_id+1]] - dataframe_group["{name}"] = Group(arr_{idx}, size) + dataframe_group["{name}"] = Group(arr_{idx}, size, arr_index) """ row_kernel_template = """\ @@ -59,7 +59,7 @@ def _kernel(retval, size, input_col_0, offset_0, {extra_args}): """ groupby_apply_kernel_template = """ -def _kernel(offset, out, {input_columns}, {extra_args}): +def _kernel(offset, out, index, {input_columns}, {extra_args}): tid = cuda.threadIdx.x block_id = cuda.blockIdx.x tb_size = cuda.blockDim.x @@ -70,6 +70,7 @@ def _kernel(offset, out, {input_columns}, {extra_args}): if block_id < (len(offset) - 1): size = offset[block_id+1] - offset[block_id] + arr_index = index[offset[block_id]:offset[block_id+1]] {group_initializers} From f4bc7c4e7ddcc870940bbd57ce1732efdc235e1e Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Fri, 5 Aug 2022 17:10:58 +0000 Subject: [PATCH 005/121] Add IdxMax and IdxMin --- python/cudf/cudf/core/udf/groupby_function.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 9fd2be5679a..2b3898072f8 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -175,25 +175,25 @@ def __init__( my_idxmax_int64 = cuda.declare_device( "BlockIdxMax_int64", 
"types.int64(types.CPointer(types.int64)," - + "types.CPointer(types.int64),types.int64)", + "types.CPointer(types.int64),types.int64)", ) my_idxmax_float64 = cuda.declare_device( "BlockIdxMax_float64", "types.int64(types.CPointer(types.float64)," - + "types.CPointer(types.int64),types.int64)", + "types.CPointer(types.int64),types.int64)", ) my_idxmin_int64 = cuda.declare_device( "BlockIdxMin_int64", "types.int64(types.CPointer(types.int64)," - + "types.CPointer(types.int64),types.int64)", + "types.CPointer(types.int64),types.int64)", ) my_idxmin_float64 = cuda.declare_device( "BlockIdxMin_float64", "types.int64(types.CPointer(types.float64)," - + "types.CPointer(types.int64),types.int64)", + "types.CPointer(types.int64),types.int64)", ) # Path to the source containing the foreign function From 8659149e8ce7623c21bed6a097d4d85efd58a5be Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Sun, 14 Aug 2022 16:13:29 +0000 Subject: [PATCH 006/121] Dynamic Launch Parameter --- python/cudf/cudf/core/groupby/groupby.py | 7 +- python/cudf/cudf/core/udf/groupby_function.py | 65 ++++++++++++------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dcfe4f89ee8..af771bb32b7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -83,6 +83,7 @@ def __init__( as_index=True, dropna=True, engine="nonjit", + cache=True, ): """ Group a DataFrame or Series by a set of columns. @@ -119,6 +120,7 @@ def __init__( self._sort = sort self._dropna = dropna self._engine = engine + self._cache = cache if isinstance(by, _Grouping): by._obj = self.obj @@ -549,7 +551,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - def apply(self, function, *args, engine="nonjit"): + def apply(self, function, *args, engine="nonjit", cache=True): """Apply a python transformation function over the grouped chunk. Parameters @@ -619,9 +621,10 @@ def mult(df): group_names, offsets, _, grouped_values = self._grouped() self._engine = engine + self._cache = cache if self._engine == "jit": chunk_results = jit_groupby_apply( - offsets, grouped_values, function, *args + offsets, grouped_values, function, *args, cache=cache ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 2b3898072f8..6f8237396cb 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -19,6 +19,7 @@ from numba.core.typing import signature as nb_signature from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry +from numba.cuda.cudadrv.devices import get_context from numba.cuda.cudaimpl import lower as cuda_lower from numba.np import numpy_support from numba.types import Record @@ -36,6 +37,7 @@ _supported_cols_from_frame, _supported_dtypes_from_frame, ) +from cudf.utils.utils import _cudf_nvtx_annotate # Disable occupancy warnings to avoid polluting output when there are few # groups. 
@@ -123,16 +125,6 @@ def __init__( "types.float64(types.CPointer(types.float64),types.int64)", ) -my_count_int64 = cuda.declare_device( - "BlockCount_int64", - "types.int64(types.CPointer(types.int64),types.int64)", -) - -my_count_float64 = cuda.declare_device( - "BlockCount_float64", - "types.int64(types.CPointer(types.float64),types.int64)", -) - my_sum_int64 = cuda.declare_device( "BlockSum_int64", "types.int64(types.CPointer(types.int64),types.int64)" ) @@ -225,14 +217,6 @@ def call_my_min_float64(data, size): return my_min_float64(data, size) -def call_my_count_int64(data, size): - return my_count_int64(data, size) - - -def call_my_count_float64(data, size): - return my_count_float64(data, size) - - def call_my_sum_int64(data, size): return my_sum_int64(data, size) @@ -803,12 +787,19 @@ def _get_groupby_apply_kernel(frame, func, args): return kernel, return_type -def jit_groupby_apply(offsets, grouped_values, function, *args): +@_cudf_nvtx_annotate +def jit_groupby_apply(offsets, grouped_values, function, *args, cache=True): ngroups = len(offsets) - 1 - kernel, return_type = _compile_or_get( - grouped_values, function, args, _get_groupby_apply_kernel - ) + if cache is True: + kernel, return_type = _compile_or_get( + grouped_values, function, args, _get_groupby_apply_kernel + ) + else: + kernel, return_type = _get_groupby_apply_kernel( + grouped_values, function, args + ) + return_type = numpy_support.as_dtype(return_type) output = cp.empty(ngroups, dtype=return_type) @@ -823,9 +814,37 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): launch_args += list(args) + max_group_size = cp.diff(offsets).max() + + if max_group_size >= 1024: + if ngroups < 100: + blocklim = 1024 + else: + blocklim = 256 + else: + blocklim = ((max_group_size + 32 - 1) / 32) * 32 + + if kernel.specialized: + specialized = kernel + else: + specialized = kernel.specialize(*launch_args) + + # Ask the driver to give a good config + ctx = get_context() + # Dispatcher is specialized, so there's only one definition - get + # it so we can get the cufunc from the code library + kern_def = next(iter(specialized.overloads.values())) + kwargs = dict( + func=kern_def._codelibrary.get_cufunc(), + b2d_func=0, + memsize=0, + blocksizelimit=blocklim, + ) + _, tpb = ctx.get_max_potential_block_size(**kwargs) + stream = cuda.default_stream() - kernel[ngroups, 256, stream](*launch_args) + specialized[ngroups, tpb, stream](*launch_args) stream.synchronize() From b7ede4312cb3c6fa1ccd86ee3a2c944d5f8b53fc Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Fri, 19 Aug 2022 18:35:59 +0000 Subject: [PATCH 007/121] Code cleanup #1 --- python/cudf/cudf/core/groupby/groupby.py | 54 ++- python/cudf/cudf/core/udf/function.cu | 308 ++++++++++-------- python/cudf/cudf/core/udf/groupby_function.py | 10 +- 3 files changed, 194 insertions(+), 178 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index af771bb32b7..f65000b2b30 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -82,7 +82,6 @@ def __init__( sort=False, as_index=True, dropna=True, - engine="nonjit", cache=True, ): """ @@ -119,8 +118,6 @@ def __init__( self._level = level self._sort = sort self._dropna = dropna - self._engine = engine - self._cache = cache if isinstance(by, _Grouping): by._obj = self.obj @@ -551,7 +548,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - def apply(self, function, *args, 
engine="nonjit", cache=True): + def apply(self, function, *args, engine=None, cache=True): """Apply a python transformation function over the grouped chunk. Parameters @@ -620,44 +617,39 @@ def mult(df): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, _, grouped_values = self._grouped() - self._engine = engine - self._cache = cache - if self._engine == "jit": + if engine == "numba": chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args, cache=cache ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names - if self._sort: - result = result.sort_index() - return result - - ngroups = len(offsets) - 1 - if ngroups > self._MAX_GROUPS_BEFORE_WARN: - warnings.warn( - f"GroupBy.apply() performance scales poorly with " - f"number of groups. Got {ngroups} groups." - ) + else: + ngroups = len(offsets) - 1 + if ngroups > self._MAX_GROUPS_BEFORE_WARN: + warnings.warn( + f"GroupBy.apply() performance scales poorly with " + f"number of groups. Got {ngroups} groups." + ) - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] - chunk_results = [function(chk, *args) for chk in chunks] + chunks = [ + grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) + ] + chunk_results = [function(chk, *args) for chk in chunks] - if not len(chunk_results): - return self.obj.head(0) + if not len(chunk_results): + return self.obj.head(0) - if cudf.api.types.is_scalar(chunk_results[0]): - result = cudf.Series(chunk_results, index=group_names) - result.index.names = self.grouping.names - elif isinstance(chunk_results[0], cudf.Series): - if isinstance(self.obj, cudf.DataFrame): - result = cudf.concat(chunk_results, axis=1).T + if cudf.api.types.is_scalar(chunk_results[0]): + result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names + elif isinstance(chunk_results[0], cudf.Series): + if isinstance(self.obj, cudf.DataFrame): + result = cudf.concat(chunk_results, axis=1).T + result.index.names = self.grouping.names + else: + result = cudf.concat(chunk_results) else: result = cudf.concat(chunk_results) - else: - result = cudf.concat(chunk_results) if self._sort: result = result.sort_index() diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index 47e3947c65a..749088f8307 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -1,7 +1,23 @@ -// Copyright (c) 2020-2022, NVIDIA CORPORATION. - -#include -#include +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +using size_type = int; // double atomicAdd __device__ __forceinline__ double atomicAdd(double* address, double val) @@ -47,10 +63,11 @@ __device__ __forceinline__ double atomicMin(double *address, double val) return __longlong_as_double(old); } -extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_sum = 0; __shared__ int64_t sum; @@ -62,9 +79,9 @@ extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t *d // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; local_sum += load; } } @@ -78,10 +95,11 @@ extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t *d return 0; } -extern "C" __device__ int BlockSum_float64(double *numba_return_value, double *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockSum_float64(double *numba_return_value, double const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_sum = 0; __shared__ double sum; @@ -93,9 +111,9 @@ extern "C" __device__ int BlockSum_float64(double *numba_return_value, double *d // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; local_sum += load; } } @@ -110,10 +128,11 @@ extern "C" __device__ int BlockSum_float64(double *numba_return_value, double *d } -extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_sum = 0; double mean; @@ -126,9 +145,9 @@ extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t *d // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; local_sum += load; } } @@ -137,7 +156,7 @@ extern "C" __device__ int BlockMean_int64(double 
*numba_return_value, int64_t *d __syncthreads(); - mean = sum * 1.0 / size; + mean = sum / static_cast(size); *numba_return_value = mean; @@ -145,10 +164,11 @@ extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t *d } -extern "C" __device__ int BlockMean_float64(double *numba_return_value, double *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockMean_float64(double *numba_return_value, double const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_sum = 0; double mean; @@ -161,9 +181,9 @@ extern "C" __device__ int BlockMean_float64(double *numba_return_value, double * // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; local_sum += load; } } @@ -172,7 +192,7 @@ extern "C" __device__ int BlockMean_float64(double *numba_return_value, double * __syncthreads(); - mean = sum * 1.0 / size; + mean = sum / static_cast(size); *numba_return_value = mean; @@ -182,10 +202,11 @@ extern "C" __device__ int BlockMean_float64(double *numba_return_value, double * -extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_sum = 0; double local_var = 0; double mean; @@ -203,9 +224,9 @@ extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t *da // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; local_sum += load; } } @@ -214,13 +235,13 @@ extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t *da __syncthreads(); - mean = sum * 1.0 / size; + mean = sum / static_cast(size); // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; double temp = load - mean; temp = pow(temp, 2); local_var += temp; @@ -238,10 +259,11 @@ extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t *da return 0; } -extern "C" __device__ int BlockStd_float64(double *numba_return_value, double *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockStd_float64(double *numba_return_value, double const *data, int64_t size) { + int tid = threadIdx.x; + int 
tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_sum = 0; double local_var = 0; double mean; @@ -259,9 +281,9 @@ extern "C" __device__ int BlockStd_float64(double *numba_return_value, double *d // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; local_sum += load; } } @@ -270,13 +292,13 @@ extern "C" __device__ int BlockStd_float64(double *numba_return_value, double *d __syncthreads(); - mean = sum * 1.0 / size; + mean = sum / static_cast(size); // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; double temp = load - mean; temp = pow(temp, 2); local_var += temp; @@ -294,10 +316,11 @@ extern "C" __device__ int BlockStd_float64(double *numba_return_value, double *d return 0; } -extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_sum = 0; double local_var = 0; double mean; @@ -314,9 +337,9 @@ extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t *da // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; local_sum += load; } } @@ -325,13 +348,13 @@ extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t *da __syncthreads(); - mean = sum * 1.0 / size; + mean = sum / static_cast(size); // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; double temp = load - mean; temp = pow(temp, 2); local_var += temp; @@ -349,10 +372,11 @@ extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t *da return 0; } -extern "C" __device__ int BlockVar_float64(double *numba_return_value, double *data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockVar_float64(double *numba_return_value, double const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / 
tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_sum = 0; double local_var = 0; double mean; @@ -369,9 +393,9 @@ extern "C" __device__ int BlockVar_float64(double *numba_return_value, double *d // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; local_sum += load; } } @@ -380,13 +404,13 @@ extern "C" __device__ int BlockVar_float64(double *numba_return_value, double *d __syncthreads(); - mean = sum * 1.0 / size; + mean = sum / static_cast(size); // Calculate local sum for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; double temp = load - mean; temp = pow(temp, 2); local_var += temp; @@ -407,10 +431,10 @@ extern "C" __device__ int BlockVar_float64(double *numba_return_value, double *d // Calculate maximum of the group, return the scalar extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int local_max = INT_MIN; __shared__ int smax; @@ -422,9 +446,9 @@ extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int load = data[tid + item * tb_size]; local_max = max(local_max, load); } } @@ -442,11 +466,11 @@ extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int } // Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t *data, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_max = INT64_MIN; __shared__ int64_t smax; @@ -458,9 +482,9 @@ extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t *d // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; local_max = max(local_max, load); } } @@ -478,11 +502,11 @@ extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t *d } // Calculate maximum of the group, 
return the scalar -extern "C" __device__ int BlockMax_float64(double *numba_return_value, double *data, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockMax_float64(double *numba_return_value, double const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_max = -DBL_MAX; __shared__ double smax; @@ -494,9 +518,9 @@ extern "C" __device__ int BlockMax_float64(double *numba_return_value, double *d // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; local_max = max(local_max, load); } } @@ -515,10 +539,10 @@ extern "C" __device__ int BlockMax_float64(double *numba_return_value, double *d // Calculate minimum of the group, return the scalar extern "C" __device__ int BlockMin_int32(int *numba_return_value, int *data, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int local_min = INT_MAX; __shared__ int smin; @@ -529,9 +553,9 @@ extern "C" __device__ int BlockMin_int32(int *numba_return_value, int *data, int __syncthreads(); #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int load = data[tid + item * tb_size]; local_min = min(local_min, load); } } @@ -549,11 +573,11 @@ extern "C" __device__ int BlockMin_int32(int *numba_return_value, int *data, int } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t *data, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_min = INT64_MAX; __shared__ int64_t smin; @@ -565,9 +589,9 @@ extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t *d // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; local_min = min(local_min, load); } } @@ -585,11 +609,11 @@ extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t *d } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *data, int64_t size) { - - int tid = threadIdx.x; int 
tb_size = blockDim.x; +extern "C" __device__ int BlockMin_float64(double *numba_return_value, double const *data, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_min = DBL_MAX; __shared__ double smin; @@ -601,9 +625,9 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *d // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; local_min = min(local_min, load); } } @@ -621,11 +645,11 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double *d } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t *data, int64_t* index, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t const *data, int64_t* index, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_max = INT64_MIN; int64_t local_idx = -1; @@ -641,12 +665,12 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; if (load > local_max) { local_max = load; - local_idx = index[tid + ITEM * tb_size]; + local_idx = index[tid + item * tb_size]; } } } @@ -670,11 +694,11 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, double *data, int64_t* index, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, double const *data, int64_t* index, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_max = -DBL_MAX; int64_t local_idx = -1; @@ -690,12 +714,12 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; if (load > local_max) { local_max = load; - local_idx = index[tid + ITEM * tb_size]; + local_idx = index[tid + item * tb_size]; } } } @@ -719,11 +743,11 @@ 
extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t *data, int64_t* index, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t const *data, int64_t* index, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; int64_t local_min = INT64_MAX; int64_t local_idx = -1; @@ -739,12 +763,12 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - int64_t load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; if (load < local_min) { local_min = load; - local_idx = index[tid + ITEM * tb_size]; + local_idx = index[tid + item * tb_size]; } } } @@ -768,11 +792,11 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, double *data, int64_t* index, int64_t size) { - - int tid = threadIdx.x; int tb_size = blockDim.x; +extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, double const *data, int64_t* index, int64_t size) { + int tid = threadIdx.x; + int tb_size = blockDim.x; // Calculate how many elements each thread is working on - int ITEMS_PER_THREAD = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + tb_size - 1) / tb_size; double local_min = DBL_MAX; int64_t local_idx = -1; @@ -788,12 +812,12 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, doubl // Calculate local max for each thread #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - if (tid + (ITEM * tb_size) < size) { - double load = data[tid + ITEM * tb_size]; + for (size_type item = 0; item < items_per_thread; item++) { + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; if (load < local_min) { local_min = load; - local_idx = index[tid + ITEM * tb_size]; + local_idx = index[tid + item * tb_size]; } } } diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 6f8237396cb..0e835b2b9e6 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -816,11 +816,11 @@ def jit_groupby_apply(offsets, grouped_values, function, *args, cache=True): max_group_size = cp.diff(offsets).max() - if max_group_size >= 1024: - if ngroups < 100: - blocklim = 1024 - else: - blocklim = 256 + if max_group_size >= 1000: + # if ngroups < 100: + # blocklim = 1024 + # else: + blocklim = 256 else: blocklim = ((max_group_size + 32 - 1) / 32) * 32 From f98fc63ce0caaeb6f2839987a3f48a3d4b2e9b7b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 16:33:00 -0700 Subject: [PATCH 008/121] Add support for building the JIT functions with the rest of the build. 
--- python/cudf/CMakeLists.txt | 7 +- python/cudf/cudf/core/udf/CMakeLists.txt | 100 +++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/core/udf/CMakeLists.txt diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 72e1779401f..17ea2c5c0a4 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -25,7 +25,11 @@ project( # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. - C CXX + C + CXX + # Temporarily enabling for groupby UDFs compilation until we come up with a better + # solution. + CUDA ) option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" @@ -63,3 +67,4 @@ include(rapids-cython) rapids_cython_init() add_subdirectory(cudf/_lib) +add_subdirectory(cudf/core/udf) diff --git a/python/cudf/cudf/core/udf/CMakeLists.txt b/python/cudf/cudf/core/udf/CMakeLists.txt new file mode 100644 index 00000000000..5583553ec36 --- /dev/null +++ b/python/cudf/cudf/core/udf/CMakeLists.txt @@ -0,0 +1,100 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.20.1) + +include(rapids-cmake) +include(rapids-cuda) +include(rapids-find) + +rapids_cuda_init_architectures(GROUPBY_UDF) + +# Create a project so that we can enable CUDA architectures in this file. +project( + groupby-udf-cpp + VERSION 0.0.0 # Placeholder since this isn't a real project + LANGUAGES CUDA +) + +rapids_find_package(CUDAToolkit REQUIRED) + +# include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx(BUILD_EXPORT_SET +# strings-udf-exports INSTALL_EXPORT_SET strings-udf-exports) + +# add_library(groupby_udf_cpp SHARED function.cu) target_include_directories( groupby_udf_cpp PUBLIC +# "$" ) + +# set_target_properties( groupby_udf_cpp PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" +# CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON +# POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) +# +# set(UDF_CXX_FLAGS) set(UDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) +# target_compile_options( groupby_udf_cpp PRIVATE "$<$:${UDF_CXX_FLAGS}>" +# "$<$:${UDF_CUDA_FLAGS}>" +# ) target_link_libraries(groupby_udf_cpp PUBLIC cudf::cudf CUDA::nvrtc) install(TARGETS +# groupby_udf_cpp DESTINATION ./cudf/core/udf/) + +# This function will copy the generated PTX file from its generator-specific location in the build +# tree into a specified location in the build tree from which we can install it. 
+function(copy_ptx_to_location target destination) + set(cmake_generated_file + "${CMAKE_CURRENT_BINARY_DIR}/cmake/cp_${target}_$>_ptx.cmake" + ) + file( + GENERATE + OUTPUT "${cmake_generated_file}" + CONTENT + " +set(ptx_paths \"$\") +file(COPY \${ptx_paths} DESTINATION \"${destination}\")" + ) + + add_custom_target( + ${target}_cp_ptx ALL + COMMAND ${CMAKE_COMMAND} -P "${cmake_generated_file}" + DEPENDS $ + COMMENT "Copying PTX files to '${destination}'" + ) +endfunction() + +# Create the shim library for each architecture. +set(GROUPBY_FUNCTION_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) + +foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + set(tgt function_${arch}) + + add_library(${tgt} OBJECT function.cu) + set_target_properties( + ${tgt} + PROPERTIES CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES ${arch} + CUDA_PTX_COMPILATION ON + ) + + target_include_directories(${tgt} PUBLIC include) + target_compile_options( + ${tgt} PRIVATE "$<$:${GROUPBY_FUNCTION_CUDA_FLAGS}>" + ) + target_link_libraries(${tgt} PUBLIC CUDA::nvrtc) + + copy_ptx_to_location(${tgt} "${CMAKE_CURRENT_BINARY_DIR}/") + install( + FILES $ + DESTINATION ./cudf/core/udf/ + RENAME ${tgt}.ptx + ) +endforeach() From 11edd370c699186b38c736b2160747b8a07c5f6e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 16:51:21 -0700 Subject: [PATCH 009/121] Make engine name consistent with tests --- python/cudf/cudf/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cc3d0f04642..dc1823264ff 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -755,7 +755,7 @@ def mult(df): raise TypeError(f"type {type(function)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped() - if engine == "numba": + if engine == "jit": chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args, cache=cache ) @@ -773,7 +773,6 @@ def mult(df): grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] chunk_results = [function(chk, *args) for chk in chunks] - if not len(chunk_results): return self.obj.head(0) From 1e1241649daf8df10e235cad66cae2adc7b0b032 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 16:51:48 -0700 Subject: [PATCH 010/121] Generalize compiled PTX selection for CUDA arch. --- python/cudf/cudf/core/udf/groupby_function.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 0e835b2b9e6..26d7e8a81c0 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. +import glob import math import os @@ -188,9 +189,24 @@ def __init__( "types.CPointer(types.int64),types.int64)", ) -# Path to the source containing the foreign function -basedir = os.path.dirname(os.path.realpath(__file__)) -dev_func_ptx = os.path.join(basedir, "function.ptx") +# Load the highest compute capability file available that is less than +# the current device's. 
+files = glob.glob( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "function_*.ptx") +) +if len(files) == 0: + raise RuntimeError( + "This strings_udf installation is missing the necessary PTX " + "files. Please file an issue reporting this error and how you " + "installed cudf and strings_udf." + ) +dev = cuda.get_current_device() +cc = "".join(str(x) for x in dev.compute_capability) +sms = [os.path.basename(f).rstrip(".ptx").lstrip("function_") for f in files] +selected_sm = max(sm for sm in sms if sm < cc) +dev_func_ptx = os.path.join( + os.path.dirname(__file__), f"function_{selected_sm}.ptx" +) def call_my_max_int32(data, size): From d348fb87bc77e20038e1d882ac921292dbe8667d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 16:52:09 -0700 Subject: [PATCH 011/121] Cleanup of strings_udf PTX detection --- python/strings_udf/strings_udf/__init__.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index 94bd2531779..3ce4f74898a 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -48,11 +48,6 @@ def compiler_from_ptx_file(path): files = glob.glob( os.path.join(os.path.dirname(__file__), "shim_*.ptx") ) - dev = cuda.get_current_device() - cc = "".join(str(x) for x in dev.compute_capability) - files = glob.glob( - os.path.join(os.path.dirname(__file__), "shim_*.ptx") - ) if len(files) == 0: raise RuntimeError( "This strings_udf installation is missing the necessary PTX " @@ -62,6 +57,9 @@ def compiler_from_ptx_file(path): sms = [ os.path.basename(f).rstrip(".ptx").lstrip("shim_") for f in files ] + + dev = cuda.get_current_device() + cc = "".join(str(x) for x in dev.compute_capability) selected_sm = max(sm for sm in sms if sm < cc) ptxpath = os.path.join( os.path.dirname(__file__), f"shim_{selected_sm}.ptx" From 795e5809af758d732d721470ef52d3841dcb766b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 16:53:41 -0700 Subject: [PATCH 012/121] Fix tests with some hacks so that we can start validating. --- python/cudf/cudf/tests/test_groupby.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 3000f5f11b1..e9a1ca4dc86 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -385,8 +385,16 @@ def foo(df): return df["val1"].max() + df["val2"].min() expect = expect_grpby.apply(foo) - got_nonjit = got_grpby.apply(foo) - got_jit = got_grpby.apply(foo, engine="jit") + # TODO: Due to some inconsistencies between how pandas and cudf handle the + # created index we get different columns in the index vs the data and a + # different name. For now I'm hacking around this to test the core + # functionality, but we'll need to update that eventually. + names = list(expect.columns) + names[2] = 0 + expect.columns = names + # TODO: Shouldn't have to reset_index below + got_nonjit = got_grpby.apply(foo).reset_index() + got_jit = got_grpby.apply(foo, engine="jit").reset_index() assert_groupby_results_equal(expect, got_nonjit) assert_groupby_results_equal(expect, got_jit) From 0ce0a90e1959c438fd42d1ab981fd984290b328b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 16:54:50 -0700 Subject: [PATCH 013/121] Standardize the engine argument handling so that we get clear errors. 
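apply() now dispatches on the engine argument explicitly: engine="cudf" (the default) keeps the chunked per-group fallback, engine="jit" runs the block-per-group Numba kernel, and any other value raises ValueError instead of silently falling through. A minimal usage sketch under those assumptions (the frame is illustrative; the UDF mirrors the one in test_groupby.py):

    import cudf

    df = cudf.DataFrame(
        {
            "a": [1, 1, 2, 2],
            "val1": [1, 2, 3, 4],
            "val2": [5, 6, 7, 8],
        }
    )

    def foo(df):
        # Reduces each group to a scalar, which both engines support.
        return df["val1"].max() + df["val2"].min()

    got_cudf = df.groupby("a").apply(foo)               # engine="cudf" is the default
    got_jit = df.groupby("a").apply(foo, engine="jit")  # one thread block per group

    # df.groupby("a").apply(foo, engine="numba")        # would now raise ValueError

At this point in the series the two engines can still disagree on index names; [PATCH 012/121] works around that in the test rather than in apply() itself.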
--- python/cudf/cudf/core/groupby/groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dc1823264ff..cab57e0208f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -686,7 +686,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - def apply(self, function, *args, engine=None, cache=True): + def apply(self, function, *args, engine="cudf", cache=True): """Apply a python transformation function over the grouped chunk. Parameters @@ -761,7 +761,7 @@ def mult(df): ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names - else: + elif engine == "cudf": ngroups = len(offsets) - 1 if ngroups > self._MAX_GROUPS_BEFORE_WARN: warnings.warn( @@ -790,6 +790,8 @@ def mult(df): index_data = group_keys._data.copy(deep=True) index_data[None] = grouped_values.index._column result.index = cudf.MultiIndex._from_data(index_data) + else: + raise ValueError("Unsupported engine!.") if self._sort: result = result.sort_index() From 3493d49a9d167d972ee42d13a66d93eb10354cb2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 23 Sep 2022 17:15:24 -0700 Subject: [PATCH 014/121] Update style. --- python/cudf/cudf/core/udf/function.cu | 620 ++++++++++++++------------ 1 file changed, 324 insertions(+), 296 deletions(-) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index 749088f8307..e44ed632160 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -14,79 +14,79 @@ * limitations under the License. */ -#include -#include +#include +#include using size_type = int; // double atomicAdd __device__ __forceinline__ double atomicAdd(double* address, double val) { - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); + do { + assumed = old; + old = + atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); - } while (assumed != old); + } while (assumed != old); - return __longlong_as_double(old); + return __longlong_as_double(old); } // double atomicMax -__device__ __forceinline__ double atomicMax(double *address, double val) +__device__ __forceinline__ double atomicMax(double* address, double val) { - unsigned long long old = __double_as_longlong(*address); - while(val > __longlong_as_double(old)) - { - unsigned long long assumed = old; - if((old = atomicCAS((unsigned long long *)address, assumed, __double_as_longlong(val))) == assumed) - break; - } - return __longlong_as_double(old); + unsigned long long old = __double_as_longlong(*address); + while (val > __longlong_as_double(old)) { + unsigned long long assumed = old; + if ((old = atomicCAS((unsigned long long*)address, assumed, __double_as_longlong(val))) == + assumed) + break; + } + return __longlong_as_double(old); } // double atomicMin -__device__ __forceinline__ double atomicMin(double *address, double val) +__device__ __forceinline__ double atomicMin(double* address, double val) { - unsigned long long old = 
__double_as_longlong(*address); - while(val < __longlong_as_double(old)) - { - unsigned long long assumed = old; - if((old = atomicCAS((unsigned long long *)address, assumed, __double_as_longlong(val))) == assumed) - break; - } - return __longlong_as_double(old); + unsigned long long old = __double_as_longlong(*address); + while (val < __longlong_as_double(old)) { + unsigned long long assumed = old; + if ((old = atomicCAS((unsigned long long*)address, assumed, __double_as_longlong(val))) == + assumed) + break; + } + return __longlong_as_double(old); } -extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; + int64_t local_sum = 0; __shared__ int64_t sum; - if (tid == 0) - sum = 0; + if (tid == 0) sum = 0; __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + local_sum += load; + } } - atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); __syncthreads(); @@ -95,27 +95,29 @@ extern "C" __device__ int BlockSum_int64(int64_t *numba_return_value, int64_t co return 0; } -extern "C" __device__ int BlockSum_float64(double *numba_return_value, double const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockSum_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; + double local_sum = 0; __shared__ double sum; - if (tid == 0) - sum = 0; + if (tid == 0) sum = 0; __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + local_sum += load; + } } atomicAdd(&sum, local_sum); @@ -127,32 +129,33 @@ extern "C" __device__ int BlockSum_float64(double *numba_return_value, double co return 0; } - -extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMean_int64(double* numba_return_value, + int64_t const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; + int64_t local_sum = 0; double mean; __shared__ int64_t sum; - if (tid == 0) - sum = 0; + if (tid == 0) sum = 0; __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread 
+#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + local_sum += load; + } } - atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); __syncthreads(); @@ -161,31 +164,32 @@ extern "C" __device__ int BlockMean_int64(double *numba_return_value, int64_t co *numba_return_value = mean; return 0; - } -extern "C" __device__ int BlockMean_float64(double *numba_return_value, double const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMean_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; + double local_sum = 0; double mean; __shared__ double sum; - if (tid == 0) - sum = 0; + if (tid == 0) sum = 0; __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + local_sum += load; + } } atomicAdd(&sum, local_sum); @@ -197,18 +201,18 @@ extern "C" __device__ int BlockMean_float64(double *numba_return_value, double c *numba_return_value = mean; return 0; - } - - -extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockStd_int64(double* numba_return_value, + int64_t const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; - double local_var = 0; + int64_t local_sum = 0; + double local_var = 0; double mean; double std; @@ -222,31 +226,31 @@ extern "C" __device__ int BlockStd_int64(double *numba_return_value, int64_t con __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + local_sum += load; + } } - atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); __syncthreads(); mean = sum / static_cast(size); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } atomicAdd(&var, local_var); @@ -259,13 +263,16 @@ extern "C" 
__device__ int BlockStd_int64(double *numba_return_value, int64_t con return 0; } -extern "C" __device__ int BlockStd_float64(double *numba_return_value, double const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockStd_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; - double local_var = 0; + double local_sum = 0; + double local_var = 0; double mean; double std; @@ -279,13 +286,13 @@ extern "C" __device__ int BlockStd_float64(double *numba_return_value, double co __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + local_sum += load; + } } atomicAdd(&sum, local_sum); @@ -294,16 +301,16 @@ extern "C" __device__ int BlockStd_float64(double *numba_return_value, double co mean = sum / static_cast(size); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } atomicAdd(&var, local_var); @@ -316,13 +323,16 @@ extern "C" __device__ int BlockStd_float64(double *numba_return_value, double co return 0; } -extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockVar_int64(double* numba_return_value, + int64_t const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; - double local_var = 0; + int64_t local_sum = 0; + double local_var = 0; double mean; __shared__ int64_t sum; @@ -335,31 +345,31 @@ extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t con __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + local_sum += load; + } } - atomicAdd((unsigned long long*) &sum, (unsigned long long) local_sum); + atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); __syncthreads(); mean = sum / static_cast(size); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } + if (tid + (item * 
tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } atomicAdd(&var, local_var); @@ -372,13 +382,16 @@ extern "C" __device__ int BlockVar_int64(double *numba_return_value, int64_t con return 0; } -extern "C" __device__ int BlockVar_float64(double *numba_return_value, double const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockVar_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; - double local_var = 0; + double local_sum = 0; + double local_var = 0; double mean; __shared__ double sum; @@ -391,13 +404,13 @@ extern "C" __device__ int BlockVar_float64(double *numba_return_value, double co __syncthreads(); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + local_sum += load; + } } atomicAdd(&sum, local_sum); @@ -406,16 +419,16 @@ extern "C" __device__ int BlockVar_float64(double *numba_return_value, double co mean = sum / static_cast(size); - // Calculate local sum for each thread - #pragma unroll +// Calculate local sum for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; + } + } atomicAdd(&var, local_var); @@ -428,29 +441,28 @@ extern "C" __device__ int BlockVar_float64(double *numba_return_value, double co return 0; } - // Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMax_int32(int* numba_return_value, int* data, int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int local_max = INT_MIN; + int local_max = INT_MIN; __shared__ int smax; - if (tid == 0) - smax = INT_MIN; + if (tid == 0) smax = INT_MIN; __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int load = data[tid + item * tb_size]; - local_max = max(local_max, load); - } + if (tid + (item * tb_size) < size) { + int load = data[tid + item * tb_size]; + local_max = max(local_max, load); + } } __syncthreads(); @@ -466,33 +478,35 @@ extern "C" __device__ int BlockMax_int32(int *numba_return_value, int *data, int } // Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, + 
int64_t const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_max = INT64_MIN; + int64_t local_max = INT64_MIN; __shared__ int64_t smax; - if (tid == 0) - smax = INT64_MIN; + if (tid == 0) smax = INT64_MIN; __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_max = max(local_max, load); - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + local_max = max(local_max, load); + } } __syncthreads(); // Calculate local max for each group - atomicMax((long long*) (&smax), (long long) local_max); + atomicMax((long long*)(&smax), (long long)local_max); __syncthreads(); @@ -502,27 +516,29 @@ extern "C" __device__ int BlockMax_int64(int64_t *numba_return_value, int64_t co } // Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_float64(double *numba_return_value, double const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMax_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_max = -DBL_MAX; + double local_max = -DBL_MAX; __shared__ double smax; - if (tid == 0) - smax = -DBL_MAX; + if (tid == 0) smax = -DBL_MAX; __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_max = max(local_max, load); - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + local_max = max(local_max, load); + } } __syncthreads(); @@ -538,26 +554,26 @@ extern "C" __device__ int BlockMax_float64(double *numba_return_value, double co } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_int32(int *numba_return_value, int *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMin_int32(int* numba_return_value, int* data, int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int local_min = INT_MAX; + int local_min = INT_MAX; __shared__ int smin; - if (tid == 0) - smin = INT_MAX; - + if (tid == 0) smin = INT_MAX; + __syncthreads(); - #pragma unroll +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int load = data[tid + item * tb_size]; - local_min = min(local_min, load); - } + if (tid + (item * tb_size) < size) { + int load = data[tid + item * tb_size]; + local_min = min(local_min, load); + } } __syncthreads(); @@ -573,33 +589,35 @@ extern "C" __device__ int BlockMin_int32(int *numba_return_value, int *data, int } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ 
int BlockMin_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_min = INT64_MAX; + int64_t local_min = INT64_MAX; __shared__ int64_t smin; - if (tid == 0) - smin = INT64_MAX; - + if (tid == 0) smin = INT64_MAX; + __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_min = min(local_min, load); - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + local_min = min(local_min, load); + } } __syncthreads(); // Calculate local max for each group - atomicMin((long long*) (&smin), (long long)local_min); + atomicMin((long long*)(&smin), (long long)local_min); __syncthreads(); @@ -609,27 +627,29 @@ extern "C" __device__ int BlockMin_int64(int64_t *numba_return_value, int64_t co } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_float64(double *numba_return_value, double const *data, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockMin_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_min = DBL_MAX; + double local_min = DBL_MAX; __shared__ double smin; - if (tid == 0) - smin = DBL_MAX; - + if (tid == 0) smin = DBL_MAX; + __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_min = min(local_min, load); - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + local_min = min(local_min, load); + } } __syncthreads(); @@ -645,13 +665,17 @@ extern "C" __device__ int BlockMin_float64(double *numba_return_value, double co } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t const *data, int64_t* index, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t* index, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_max = INT64_MIN; - int64_t local_idx = -1; + int64_t local_max = INT64_MIN; + int64_t local_idx = -1; __shared__ int64_t smax; __shared__ int64_t sidx; @@ -660,31 +684,29 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t smax = INT64_MIN; sidx = INT64_MAX; } - + __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - if (load > local_max) { - local_max = load; - local_idx = index[tid + item * tb_size]; - } + if (tid + (item * 
tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + if (load > local_max) { + local_max = load; + local_idx = index[tid + item * tb_size]; } + } } __syncthreads(); // Calculate local max for each group - atomicMax((long long*) (&smax), (long long) local_max); + atomicMax((long long*)(&smax), (long long)local_max); __syncthreads(); - if (local_max == smax) { - atomicMin((long long*) (&sidx), (long long) local_idx); - } + if (local_max == smax) { atomicMin((long long*)(&sidx), (long long)local_idx); } __syncthreads(); @@ -694,13 +716,17 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t *numba_return_value, int64_t } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, double const *data, int64_t* index, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, + double const* data, + int64_t* index, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_max = -DBL_MAX; - int64_t local_idx = -1; + double local_max = -DBL_MAX; + int64_t local_idx = -1; __shared__ double smax; __shared__ int64_t sidx; @@ -709,19 +735,19 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl smax = -DBL_MAX; sidx = INT64_MAX; } - + __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - if (load > local_max) { - local_max = load; - local_idx = index[tid + item * tb_size]; - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + if (load > local_max) { + local_max = load; + local_idx = index[tid + item * tb_size]; } + } } __syncthreads(); @@ -731,9 +757,7 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl __syncthreads(); - if (local_max == smax) { - atomicMin((long long*) (&sidx), (long long)local_idx); - } + if (local_max == smax) { atomicMin((long long*)(&sidx), (long long)local_idx); } __syncthreads(); @@ -743,13 +767,17 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t *numba_return_value, doubl } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t const *data, int64_t* index, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t* index, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_min = INT64_MAX; - int64_t local_idx = -1; + int64_t local_min = INT64_MAX; + int64_t local_idx = -1; __shared__ int64_t smin; __shared__ int64_t sidx; @@ -758,31 +786,29 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t smin = INT64_MAX; sidx = INT64_MAX; } - + __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - if (load < local_min) 
{ - local_min = load; - local_idx = index[tid + item * tb_size]; - } + if (tid + (item * tb_size) < size) { + int64_t load = data[tid + item * tb_size]; + if (load < local_min) { + local_min = load; + local_idx = index[tid + item * tb_size]; } + } } __syncthreads(); // Calculate local max for each group - atomicMin((long long*) (&smin), (long long) local_min); + atomicMin((long long*)(&smin), (long long)local_min); __syncthreads(); - if (local_min == smin) { - atomicMin((long long*) (&sidx), (long long)local_idx); - } + if (local_min == smin) { atomicMin((long long*)(&sidx), (long long)local_idx); } __syncthreads(); @@ -792,13 +818,17 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t *numba_return_value, int64_t } // Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, double const *data, int64_t* index, int64_t size) { - int tid = threadIdx.x; +extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, + double const* data, + int64_t* index, + int64_t size) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_min = DBL_MAX; - int64_t local_idx = -1; + double local_min = DBL_MAX; + int64_t local_idx = -1; __shared__ double smin; __shared__ int64_t sidx; @@ -807,19 +837,19 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, doubl smin = DBL_MAX; sidx = INT64_MAX; } - + __syncthreads(); - // Calculate local max for each thread - #pragma unroll +// Calculate local max for each thread +#pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - if (load < local_min) { - local_min = load; - local_idx = index[tid + item * tb_size]; - } + if (tid + (item * tb_size) < size) { + double load = data[tid + item * tb_size]; + if (load < local_min) { + local_min = load; + local_idx = index[tid + item * tb_size]; } + } } __syncthreads(); @@ -829,9 +859,7 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t *numba_return_value, doubl __syncthreads(); - if (local_min == smin) { - atomicMin((long long*) (&sidx), (long long)local_idx); - } + if (local_min == smin) { atomicMin((long long*)(&sidx), (long long)local_idx); } __syncthreads(); From 7f9ea1ff445fe35ab7c6d0092985e681b7be17fd Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Wed, 26 Oct 2022 17:47:58 +0000 Subject: [PATCH 015/121] Refactoring C++ function --- python/cudf/cudf/core/udf/function.cu | 714 +++++------------- python/cudf/cudf/core/udf/groupby_function.py | 4 +- 2 files changed, 202 insertions(+), 516 deletions(-) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index e44ed632160..67a27cce5db 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -35,6 +35,12 @@ __device__ __forceinline__ double atomicAdd(double* address, double val) return __longlong_as_double(old); } +// int64_t atomicAdd +__device__ __forceinline__ int64_t atomicAdd(int64_t* address, int64_t val) +{ + return atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + // double atomicMax __device__ __forceinline__ double atomicMax(double* address, double val) { @@ -48,6 +54,12 @@ __device__ __forceinline__ double atomicMax(double* address, double val) return __longlong_as_double(old); } +// int64_t atomicMax +__device__ 
__forceinline__ int64_t atomicMax(int64_t* address, int64_t val) +{ + return atomicMax((long long*)address, (long long)val); +} + // double atomicMin __device__ __forceinline__ double atomicMin(double* address, double val) { @@ -61,17 +73,19 @@ __device__ __forceinline__ double atomicMin(double* address, double val) return __longlong_as_double(old); } -extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t size) +// int64_t atomicMin +__device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) { - int tid = threadIdx.x; - int tb_size = blockDim.x; - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; + return atomicMin((long long*)address, (long long)val); +} - __shared__ int64_t sum; +// Use a C++ templated __device__ function to implement the body of the algorithm. +template +__device__ T device_sum(T const* data, int const items_per_thread, size_type size) { + __shared__ T sum; + int tid = threadIdx.x; + int tb_size = blockDim.x; + T local_sum = 0; if (tid == 0) sum = 0; @@ -81,509 +95,361 @@ extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; + T load = data[tid + item * tb_size]; local_sum += load; } } - atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); + atomicAdd(&sum, local_sum); __syncthreads(); - *numba_return_value = sum; - - return 0; + return sum; } -extern "C" __device__ int BlockSum_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - int tid = threadIdx.x; - int tb_size = blockDim.x; - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; +// Use a C++ templated __device__ function to implement the body of the algorithm. +template +__device__ T device_var(T const* data, int const items_per_thread, size_type size) { - __shared__ double sum; + int tid = threadIdx.x; + int tb_size = blockDim.x; - if (tid == 0) sum = 0; + double local_var = 0; + __shared__ double var; + if (tid == 0) var = 0; - __syncthreads(); + T sum = device_sum(data, items_per_thread, size); + double mean = sum / static_cast(size); // Calculate local sum for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; + T load = data[tid + item * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); + local_var += temp; } } - atomicAdd(&sum, local_sum); + atomicAdd(&var, local_var); __syncthreads(); - *numba_return_value = sum; - - return 0; + return (var / (size - 1)); } -extern "C" __device__ int BlockMean_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ - int tid = threadIdx.x; +// Use a C++ templated __device__ function to implement the body of the algorithm. 
+template +__device__ T device_max(T const* data, int const items_per_thread, size_type size, T init_val) { + + int tid = threadIdx.x; int tb_size = blockDim.x; - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; - double mean; - __shared__ int64_t sum; + T local_max = init_val; + __shared__ T smax; - if (tid == 0) sum = 0; + if (tid == 0) smax = init_val; __syncthreads(); -// Calculate local sum for each thread +// Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; + T load = data[tid + item * tb_size]; + local_max = max(local_max, load); } } - atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); - __syncthreads(); - mean = sum / static_cast(size); + // Calculate local max for each group + atomicMax((&smax), local_max); - *numba_return_value = mean; + __syncthreads(); - return 0; + return smax; } -extern "C" __device__ int BlockMean_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - int tid = threadIdx.x; +// Use a C++ templated __device__ function to implement the body of the algorithm. +template +__device__ T device_min(T const* data, int const items_per_thread, size_type size, T init_val) { + + int tid = threadIdx.x; int tb_size = blockDim.x; - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; - double mean; - __shared__ double sum; + T local_min = init_val; + __shared__ T smin; - if (tid == 0) sum = 0; + if (tid == 0) smin = init_val; __syncthreads(); -// Calculate local sum for each thread +// Calculate local min for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; + T load = data[tid + item * tb_size]; + local_min = min(local_min, load); } } - atomicAdd(&sum, local_sum); - __syncthreads(); - mean = sum / static_cast(size); + // Calculate local min for each group + atomicMin((&smin), local_min); - *numba_return_value = mean; + __syncthreads(); - return 0; + return smin; } -extern "C" __device__ int BlockStd_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ +// Use a C++ templated __device__ function to implement the body of the algorithm. 
+template +__device__ T device_idxmax(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val) { + int tid = threadIdx.x; int tb_size = blockDim.x; + // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; - double local_var = 0; - double mean; - double std; + T local_max = init_val; + int64_t local_idx = -1; - __shared__ int64_t sum; - __shared__ double var; + __shared__ T smax; + __shared__ int64_t sidx; if (tid == 0) { - sum = 0; - var = 0; + smax = init_val; + sidx = INT64_MAX; } __syncthreads(); -// Calculate local sum for each thread +// Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; + T load = data[tid + item * tb_size]; + if (load > local_max) { + local_max = load; + local_idx = index[tid + item * tb_size]; + } } } - atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); - __syncthreads(); - mean = sum / static_cast(size); - -// Calculate local sum for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } - - atomicAdd(&var, local_var); + // Calculate local max for each group + atomicMax((&smax), local_max); __syncthreads(); - std = sqrt(var / (size - 1)); + if (local_max == smax) { atomicMin((&sidx),local_idx); } - *numba_return_value = std; + __syncthreads(); - return 0; + return sidx; } -extern "C" __device__ int BlockStd_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - int tid = threadIdx.x; +// Use a C++ templated __device__ function to implement the body of the algorithm. 
+template +__device__ T device_idxmin(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val) { + + int tid = threadIdx.x; int tb_size = blockDim.x; - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; - double local_var = 0; - double mean; - double std; - __shared__ double sum; - __shared__ double var; + T local_min = init_val; + int64_t local_idx = -1; + + __shared__ T smin; + __shared__ int64_t sidx; if (tid == 0) { - sum = 0; - var = 0; + smin = init_val; + sidx = INT64_MAX; } __syncthreads(); -// Calculate local sum for each thread +// Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; + T load = data[tid + item * tb_size]; + if (load < local_min) { + local_min = load; + local_idx = index[tid + item * tb_size]; + } } } - atomicAdd(&sum, local_sum); - __syncthreads(); - mean = sum / static_cast(size); - -// Calculate local sum for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } - - atomicAdd(&var, local_var); + // Calculate local max for each group + atomicMin((&smin), local_min); __syncthreads(); - std = sqrt(var / (size - 1)); + if (local_min == smin) { atomicMin((&sidx), local_idx); } - *numba_return_value = std; + __syncthreads(); - return 0; + return sidx; } -extern "C" __device__ int BlockVar_int64(double* numba_return_value, +extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_sum = 0; - double local_var = 0; - double mean; - - __shared__ int64_t sum; - __shared__ double var; - - if (tid == 0) { - sum = 0; - var = 0; - } - - __syncthreads(); - -// Calculate local sum for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_sum += load; - } - } - - atomicAdd((unsigned long long*)&sum, (unsigned long long)local_sum); - - __syncthreads(); - - mean = sum / static_cast(size); - -// Calculate local sum for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } + + int64_t sum = device_sum(data, items_per_thread, size); - atomicAdd(&var, local_var); - - __syncthreads(); - - var = var / (size - 1); - - *numba_return_value = var; + *numba_return_value = sum; return 0; } -extern "C" __device__ int BlockVar_float64(double* numba_return_value, +extern "C" __device__ int BlockSum_float64(double* numba_return_value, double const* data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_sum = 0; - double local_var = 0; - double mean; - __shared__ double sum; - __shared__ double var; + double sum = 
device_sum(data, items_per_thread, size); - if (tid == 0) { - sum = 0; - var = 0; - } + *numba_return_value = sum; - __syncthreads(); + return 0; +} -// Calculate local sum for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_sum += load; - } - } +extern "C" __device__ int BlockMean_int64(double* numba_return_value, + int64_t const* data, + int64_t size) +{ + int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + auto const items_per_thread = (size + tb_size - 1) / tb_size; - atomicAdd(&sum, local_sum); + int64_t sum = device_sum(data, items_per_thread, size); - __syncthreads(); + double mean = sum / static_cast(size); - mean = sum / static_cast(size); + *numba_return_value = mean; -// Calculate local sum for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } - } + return 0; +} - atomicAdd(&var, local_var); +extern "C" __device__ int BlockMean_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + auto const items_per_thread = (size + tb_size - 1) / tb_size; - __syncthreads(); + double sum = device_sum(data, items_per_thread, size); - var = var / (size - 1); + double mean = sum / static_cast(size); - *numba_return_value = var; + *numba_return_value = mean; return 0; } -// Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_int32(int* numba_return_value, int* data, int64_t size) +extern "C" __device__ int BlockStd_int64(double* numba_return_value, + int64_t const* data, + int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int local_max = INT_MIN; - - __shared__ int smax; - if (tid == 0) smax = INT_MIN; + double var = device_var(data, items_per_thread, size); - __syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int load = data[tid + item * tb_size]; - local_max = max(local_max, load); - } - } + *numba_return_value = sqrt(var); - __syncthreads(); + return 0; +} - // Calculate local max for each group - atomicMax(&smax, local_max); +extern "C" __device__ int BlockStd_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + auto const items_per_thread = (size + tb_size - 1) / tb_size; - __syncthreads(); + double var = device_var(data, items_per_thread, size); - *numba_return_value = smax; + *numba_return_value = sqrt(var); return 0; } -// Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, +extern "C" __device__ int BlockVar_int64(double* numba_return_value, int64_t const* data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_max = INT64_MIN; - - __shared__ int64_t smax; - - if (tid == 0) smax = INT64_MIN; - - 
__syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_max = max(local_max, load); - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMax((long long*)(&smax), (long long)local_max); - __syncthreads(); + double var = device_var(data, items_per_thread, size); - *numba_return_value = smax; + *numba_return_value = var; return 0; } -// Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_float64(double* numba_return_value, +extern "C" __device__ int BlockVar_float64(double* numba_return_value, double const* data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_max = -DBL_MAX; - __shared__ double smax; + double var = device_var(data, items_per_thread, size); - if (tid == 0) smax = -DBL_MAX; - - __syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_max = max(local_max, load); - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMax((&smax), local_max); - - __syncthreads(); - - *numba_return_value = smax; + *numba_return_value = var; return 0; } -// Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_int32(int* numba_return_value, int* data, int64_t size) +// Calculate maximum of the group, return the scalar +extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int local_min = INT_MAX; - __shared__ int smin; + int64_t max_val = device_max(data, items_per_thread, size, INT64_MIN); - if (tid == 0) smin = INT_MAX; - - __syncthreads(); - -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int load = data[tid + item * tb_size]; - local_min = min(local_min, load); - } - } + *numba_return_value = max_val; - __syncthreads(); + return 0; +} - // Calculate local max for each group - atomicMin(&smin, local_min); +// Calculate maximum of the group, return the scalar +extern "C" __device__ int BlockMax_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + int tb_size = blockDim.x; + // Calculate how many elements each thread is working on + auto const items_per_thread = (size + tb_size - 1) / tb_size; - __syncthreads(); + double max_val = device_max(data, items_per_thread, size, -DBL_MAX); - *numba_return_value = smin; + *numba_return_value = max_val; return 0; } @@ -593,35 +459,13 @@ extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_min = INT64_MAX; - __shared__ int64_t smin; + int64_t min_val = device_min(data, items_per_thread, size, INT64_MAX); - if (tid == 0) smin = INT64_MAX; - - __syncthreads(); - -// Calculate local max for each thread 
-#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - local_min = min(local_min, load); - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMin((long long*)(&smin), (long long)local_min); - - __syncthreads(); - - *numba_return_value = smin; + *numba_return_value = min_val; return 0; } @@ -631,35 +475,13 @@ extern "C" __device__ int BlockMin_float64(double* numba_return_value, double const* data, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_min = DBL_MAX; - __shared__ double smin; + double min_val = device_min(data, items_per_thread, size, DBL_MAX); - if (tid == 0) smin = DBL_MAX; - - __syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - local_min = min(local_min, load); - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMin((&smin), local_min); - - __syncthreads(); - - *numba_return_value = smin; + *numba_return_value = min_val; return 0; } @@ -670,47 +492,13 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_max = INT64_MIN; - int64_t local_idx = -1; - - __shared__ int64_t smax; - __shared__ int64_t sidx; - if (tid == 0) { - smax = INT64_MIN; - sidx = INT64_MAX; - } + int64_t idxmax = device_idxmax(data, items_per_thread, index, size, INT64_MIN); - __syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - if (load > local_max) { - local_max = load; - local_idx = index[tid + item * tb_size]; - } - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMax((long long*)(&smax), (long long)local_max); - - __syncthreads(); - - if (local_max == smax) { atomicMin((long long*)(&sidx), (long long)local_idx); } - - __syncthreads(); - - *numba_return_value = sidx; + *numba_return_value = idxmax; return 0; } @@ -721,47 +509,13 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_max = -DBL_MAX; - int64_t local_idx = -1; - - __shared__ double smax; - __shared__ int64_t sidx; - - if (tid == 0) { - smax = -DBL_MAX; - sidx = INT64_MAX; - } - - __syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - if (load > local_max) { - local_max = load; - local_idx = index[tid + item * tb_size]; - } - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMax((&smax), local_max); - - __syncthreads(); - if (local_max == smax) { atomicMin((long long*)(&sidx), (long long)local_idx); } + int64_t 
idxmax = device_idxmax(data, items_per_thread, index, size, -DBL_MAX); - __syncthreads(); - - *numba_return_value = sidx; + *numba_return_value = idxmax; return 0; } @@ -772,47 +526,13 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t local_min = INT64_MAX; - int64_t local_idx = -1; - - __shared__ int64_t smin; - __shared__ int64_t sidx; - - if (tid == 0) { - smin = INT64_MAX; - sidx = INT64_MAX; - } - - __syncthreads(); -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - int64_t load = data[tid + item * tb_size]; - if (load < local_min) { - local_min = load; - local_idx = index[tid + item * tb_size]; - } - } - } - - __syncthreads(); - - // Calculate local max for each group - atomicMin((long long*)(&smin), (long long)local_min); - - __syncthreads(); - - if (local_min == smin) { atomicMin((long long*)(&sidx), (long long)local_idx); } - - __syncthreads(); + int64_t idxmin = device_idxmin(data, items_per_thread, index, size, INT64_MAX); - *numba_return_value = sidx; + *numba_return_value = idxmin; return 0; } @@ -823,47 +543,13 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double local_min = DBL_MAX; - int64_t local_idx = -1; - - __shared__ double smin; - __shared__ int64_t sidx; - - if (tid == 0) { - smin = DBL_MAX; - sidx = INT64_MAX; - } - - __syncthreads(); - -// Calculate local max for each thread -#pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - double load = data[tid + item * tb_size]; - if (load < local_min) { - local_min = load; - local_idx = index[tid + item * tb_size]; - } - } - } - - __syncthreads(); - // Calculate local max for each group - atomicMin((&smin), local_min); - - __syncthreads(); - - if (local_min == smin) { atomicMin((long long*)(&sidx), (long long)local_idx); } - - __syncthreads(); + int64_t idxmin = device_idxmin(data, items_per_thread, index, size, DBL_MAX); - *numba_return_value = sidx; + *numba_return_value = idxmin; return 0; } \ No newline at end of file diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 26d7e8a81c0..f9785cdbcd7 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -196,14 +196,14 @@ def __init__( ) if len(files) == 0: raise RuntimeError( - "This strings_udf installation is missing the necessary PTX " + "This groupby apply installation is missing the necessary PTX " "files. Please file an issue reporting this error and how you " "installed cudf and strings_udf." 
) dev = cuda.get_current_device() cc = "".join(str(x) for x in dev.compute_capability) sms = [os.path.basename(f).rstrip(".ptx").lstrip("function_") for f in files] -selected_sm = max(sm for sm in sms if sm < cc) +selected_sm = max(sm for sm in sms if sm <= cc) dev_func_ptx = os.path.join( os.path.dirname(__file__), f"function_{selected_sm}.ptx" ) From 3d76a44995efe4f89692065637ca5c96ecfb605a Mon Sep 17 00:00:00 2001 From: Chameleon Cloud User Date: Wed, 2 Nov 2022 22:57:25 +0000 Subject: [PATCH 016/121] Fix bug in C++ and Python Cleanup --- python/cudf/cudf/core/udf/function.cu | 279 ++++++++++++------ python/cudf/cudf/core/udf/groupby_function.py | 263 +++++------------ 2 files changed, 263 insertions(+), 279 deletions(-) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index 67a27cce5db..65401cde262 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -81,16 +81,11 @@ __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ T device_sum(T const* data, int const items_per_thread, size_type size) { - __shared__ T sum; +__device__ void device_sum(T const* data, int const items_per_thread, size_type size, T* sum) { int tid = threadIdx.x; int tb_size = blockDim.x; T local_sum = 0; - if (tid == 0) sum = 0; - - __syncthreads(); - // Calculate local sum for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { @@ -100,26 +95,28 @@ __device__ T device_sum(T const* data, int const items_per_thread, size_type siz } } - atomicAdd(&sum, local_sum); + atomicAdd(sum, local_sum); __syncthreads(); - return sum; } // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ T device_var(T const* data, int const items_per_thread, size_type size) { +__device__ void device_var(T const* data, int const items_per_thread, size_type size, T* sum, double* var) { - int tid = threadIdx.x; + int tid = threadIdx.x; int tb_size = blockDim.x; - + // Calculate how many elements each thread is working on + T local_sum = 0; double local_var = 0; - __shared__ double var; - if (tid == 0) var = 0; + double mean; - T sum = device_sum(data, items_per_thread, size); - double mean = sum / static_cast(size); + device_sum(data, items_per_thread, size, sum); + + __syncthreads(); + + mean = (*sum) / static_cast(size); // Calculate local sum for each thread #pragma unroll @@ -132,26 +129,23 @@ __device__ T device_var(T const* data, int const items_per_thread, size_type siz } } - atomicAdd(&var, local_var); + atomicAdd(var, local_var); __syncthreads(); - return (var / (size - 1)); + *var = *var / (size - 1); + + __syncthreads(); } // Use a C++ templated __device__ function to implement the body of the algorithm. 
template -__device__ T device_max(T const* data, int const items_per_thread, size_type size, T init_val) { +__device__ void device_max(T const* data, int const items_per_thread, size_type size, T init_val, T* smax) { int tid = threadIdx.x; int tb_size = blockDim.x; T local_max = init_val; - __shared__ T smax; - - if (tid == 0) smax = init_val; - - __syncthreads(); // Calculate local max for each thread #pragma unroll @@ -165,26 +159,19 @@ __device__ T device_max(T const* data, int const items_per_thread, size_type siz __syncthreads(); // Calculate local max for each group - atomicMax((&smax), local_max); + atomicMax(smax, local_max); __syncthreads(); - - return smax; } // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ T device_min(T const* data, int const items_per_thread, size_type size, T init_val) { +__device__ void device_min(T const* data, int const items_per_thread, size_type size, T init_val, T* smin) { int tid = threadIdx.x; int tb_size = blockDim.x; T local_min = init_val; - __shared__ T smin; - - if (tid == 0) smin = init_val; - - __syncthreads(); // Calculate local min for each thread #pragma unroll @@ -198,16 +185,14 @@ __device__ T device_min(T const* data, int const items_per_thread, size_type siz __syncthreads(); // Calculate local min for each group - atomicMin((&smin), local_min); + atomicMin(smin, local_min); __syncthreads(); - - return smin; } // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ T device_idxmax(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val) { +__device__ void device_idxmax(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val, T* smax, int64_t* sidx) { int tid = threadIdx.x; int tb_size = blockDim.x; @@ -216,16 +201,6 @@ __device__ T device_idxmax(T const* data, int const items_per_thread, int64_t co T local_max = init_val; int64_t local_idx = -1; - __shared__ T smax; - __shared__ int64_t sidx; - - if (tid == 0) { - smax = init_val; - sidx = INT64_MAX; - } - - __syncthreads(); - // Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { @@ -241,20 +216,18 @@ __device__ T device_idxmax(T const* data, int const items_per_thread, int64_t co __syncthreads(); // Calculate local max for each group - atomicMax((&smax), local_max); + atomicMax(smax, local_max); __syncthreads(); - if (local_max == smax) { atomicMin((&sidx),local_idx); } + if (local_max == (*smax)) { atomicMin(sidx,local_idx); } __syncthreads(); - - return sidx; } // Use a C++ templated __device__ function to implement the body of the algorithm. 
template -__device__ T device_idxmin(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val) { +__device__ void device_idxmin(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val, T* smin, int64_t* sidx) { int tid = threadIdx.x; int tb_size = blockDim.x; @@ -262,16 +235,6 @@ __device__ T device_idxmin(T const* data, int const items_per_thread, int64_t co T local_min = init_val; int64_t local_idx = -1; - __shared__ T smin; - __shared__ int64_t sidx; - - if (tid == 0) { - smin = init_val; - sidx = INT64_MAX; - } - - __syncthreads(); - // Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { @@ -287,15 +250,13 @@ __device__ T device_idxmin(T const* data, int const items_per_thread, int64_t co __syncthreads(); // Calculate local max for each group - atomicMin((&smin), local_min); + atomicMin(smin, local_min); __syncthreads(); - if (local_min == smin) { atomicMin((&sidx), local_idx); } + if (local_min == (*smin)) { atomicMin(sidx, local_idx); } __syncthreads(); - - return sidx; } extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, @@ -305,8 +266,15 @@ extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, int tb_size = blockDim.x; // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; + + __shared__ int64_t sum; + if (threadIdx.x == 0) { + sum = 0; + } + + __syncthreads(); - int64_t sum = device_sum(data, items_per_thread, size); + device_sum(data, items_per_thread, size, &sum); *numba_return_value = sum; @@ -321,7 +289,14 @@ extern "C" __device__ int BlockSum_float64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double sum = device_sum(data, items_per_thread, size); + __shared__ double sum; + if (threadIdx.x == 0) { + sum = 0; + } + + __syncthreads(); + + device_sum(data, items_per_thread, size, &sum); *numba_return_value = sum; @@ -336,7 +311,14 @@ extern "C" __device__ int BlockMean_int64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t sum = device_sum(data, items_per_thread, size); + __shared__ int64_t sum; + if (threadIdx.x == 0) { + sum = 0; + } + + __syncthreads(); + + device_sum(data, items_per_thread, size, &sum); double mean = sum / static_cast(size); @@ -353,7 +335,14 @@ extern "C" __device__ int BlockMean_float64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double sum = device_sum(data, items_per_thread, size); + __shared__ double sum; + if (threadIdx.x == 0) { + sum = 0; + } + + __syncthreads(); + + device_sum(data, items_per_thread, size, &sum); double mean = sum / static_cast(size); @@ -370,7 +359,17 @@ extern "C" __device__ int BlockStd_int64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double var = device_var(data, items_per_thread, size); + __shared__ int64_t sum; + __shared__ double var; + + if (threadIdx.x == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + device_var(data, items_per_thread, size, &sum, &var); *numba_return_value = sqrt(var); @@ -385,7 +384,17 @@ extern "C" __device__ int BlockStd_float64(double* 
numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double var = device_var(data, items_per_thread, size); + __shared__ double sum; + __shared__ double var; + + if (threadIdx.x == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + device_var(data, items_per_thread, size, &sum, &var); *numba_return_value = sqrt(var); @@ -400,7 +409,17 @@ extern "C" __device__ int BlockVar_int64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double var = device_var(data, items_per_thread, size); + __shared__ int64_t sum; + __shared__ double var; + + if (threadIdx.x == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + device_var(data, items_per_thread, size, &sum, &var); *numba_return_value = var; @@ -415,7 +434,17 @@ extern "C" __device__ int BlockVar_float64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double var = device_var(data, items_per_thread, size); + __shared__ double sum; + __shared__ double var; + + if (threadIdx.x == 0) { + sum = 0; + var = 0; + } + + __syncthreads(); + + device_var(data, items_per_thread, size, &sum, &var); *numba_return_value = var; @@ -431,9 +460,17 @@ extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t max_val = device_max(data, items_per_thread, size, INT64_MIN); + __shared__ int64_t smax; + + if (threadIdx.x == 0) { + smax = INT64_MIN; + } + + __syncthreads(); + + device_max(data, items_per_thread, size, INT64_MIN, &smax); - *numba_return_value = max_val; + *numba_return_value = smax; return 0; } @@ -447,9 +484,17 @@ extern "C" __device__ int BlockMax_float64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double max_val = device_max(data, items_per_thread, size, -DBL_MAX); + __shared__ double smax; + + if (threadIdx.x == 0) { + smax = -DBL_MAX; + } + + __syncthreads(); + + device_max(data, items_per_thread, size, -DBL_MAX, &smax); - *numba_return_value = max_val; + *numba_return_value = smax; return 0; } @@ -463,9 +508,17 @@ extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t min_val = device_min(data, items_per_thread, size, INT64_MAX); + __shared__ int64_t smin; + + if (threadIdx.x == 0) { + smin = INT64_MAX; + } + + __syncthreads(); + + device_min(data, items_per_thread, size, INT64_MAX, &smin); - *numba_return_value = min_val; + *numba_return_value = smin; return 0; } @@ -479,9 +532,17 @@ extern "C" __device__ int BlockMin_float64(double* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - double min_val = device_min(data, items_per_thread, size, DBL_MAX); + __shared__ double smin; - *numba_return_value = min_val; + if (threadIdx.x == 0) { + smin = DBL_MAX; + } + + __syncthreads(); + + device_min(data, items_per_thread, size, DBL_MAX, &smin); + + *numba_return_value = smin; return 0; } @@ -496,9 +557,19 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, // Calculate how 
many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t idxmax = device_idxmax(data, items_per_thread, index, size, INT64_MIN); + __shared__ int64_t smax; + __shared__ int64_t sidx; + + if (threadIdx.x == 0) { + smax = INT64_MIN; + sidx = INT64_MAX; + } + + __syncthreads(); + + device_idxmax(data, items_per_thread, index, size, INT64_MIN, &smax, &sidx); - *numba_return_value = idxmax; + *numba_return_value = sidx; return 0; } @@ -513,9 +584,19 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t idxmax = device_idxmax(data, items_per_thread, index, size, -DBL_MAX); + __shared__ double smax; + __shared__ int64_t sidx; + + if (threadIdx.x == 0) { + smax = -DBL_MAX; + sidx = INT64_MAX; + } + + __syncthreads(); - *numba_return_value = idxmax; + device_idxmax(data, items_per_thread, index, size, -DBL_MAX, &smax, &sidx); + + *numba_return_value = smax; return 0; } @@ -530,9 +611,19 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t idxmin = device_idxmin(data, items_per_thread, index, size, INT64_MAX); + __shared__ int64_t smin; + __shared__ int64_t sidx; + + if (threadIdx.x == 0) { + smin = INT64_MAX; + sidx = INT64_MAX; + } + + __syncthreads(); + + device_idxmin(data, items_per_thread, index, size, INT64_MAX, &smin, &sidx); - *numba_return_value = idxmin; + *numba_return_value = sidx; return 0; } @@ -547,9 +638,19 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, // Calculate how many elements each thread is working on auto const items_per_thread = (size + tb_size - 1) / tb_size; - int64_t idxmin = device_idxmin(data, items_per_thread, index, size, DBL_MAX); + __shared__ double smin; + __shared__ int64_t sidx; + + if (threadIdx.x == 0) { + smin = DBL_MAX; + sidx = INT64_MAX; + } + + __syncthreads(); + + device_idxmin(data, items_per_thread, index, size, DBL_MAX, &smin, &sidx); - *numba_return_value = idxmin; + *numba_return_value = sidx; return 0; } \ No newline at end of file diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index f9785cdbcd7..6dc6eb16e26 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -44,6 +44,7 @@ # groups. 
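# For context: the Group/GroupType machinery defined below is what a JIT
# groupby UDF sees for each column of the grouped chunk. A minimal usage
# sketch (the frame, column names and UDF body are illustrative, not part of
# this patch; engine="jit" is the new apply path):

import cudf

df = cudf.DataFrame({"key": [0, 0, 1, 1], "val": [1.0, 2.0, 3.0, 4.0]})

def udf(grouped):
    # inside the compiled UDF, each column behaves as a Group exposing
    # max/min/sum/mean/var/std/idxmax/idxmin/size/count
    return grouped["val"].max() - grouped["val"].min()

out = df.groupby("key").apply(udf, engine="jit")  # one scalar per group, returned as a Series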
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 +index_default_type = types.int64 class Group(object): def __init__(self, group_data, size, index, dtype, index_dtype): @@ -55,7 +56,7 @@ def __init__(self, group_data, size, index, dtype, index_dtype): class GroupType(numba.types.Type): - def __init__(self, group_scalar_type, index_type=types.int64): + def __init__(self, group_scalar_type, index_type=index_default_type): self.group_scalar_type = group_scalar_type self.index_type = index_type self.group_data_type = types.CPointer(group_scalar_type) @@ -71,9 +72,10 @@ def typeof_group(val, c): return GroupType( numba.np.numpy_support.from_dtype(val.dtype), numba.np.numpy_support.from_dtype(val.index_dtype), - ) # converting from numpy type to numba type + ) # Identifies instances of the Group class as GroupType +# The typing of the python "function" Group.__init__ as it appears in python code @type_callable(Group) def type_group(context): def typer(group_data, size, index): @@ -100,10 +102,6 @@ def __init__( models.StructModel.__init__(self, dmm, fe_type, members) -my_max_int32 = cuda.declare_device( - "BlockMax_int32", "types.int32(types.CPointer(types.int32),types.int64)" -) - my_max_int64 = cuda.declare_device( "BlockMax_int64", "types.int64(types.CPointer(types.int64),types.int64)" ) @@ -113,10 +111,6 @@ def __init__( "types.float64(types.CPointer(types.float64),types.int64)", ) -my_min_int32 = cuda.declare_device( - "BlockMin_int32", "types.int32(types.CPointer(types.int32),types.int64)" -) - my_min_int64 = cuda.declare_device( "BlockMin_int64", "types.int64(types.CPointer(types.int64),types.int64)" ) @@ -189,6 +183,7 @@ def __init__( "types.CPointer(types.int64),types.int64)", ) + # Load the highest compute capability file available that is less than # the current device's. files = glob.glob( @@ -196,91 +191,111 @@ def __init__( ) if len(files) == 0: raise RuntimeError( - "This groupby apply installation is missing the necessary PTX " + "This strings_udf installation is missing the necessary PTX " "files. Please file an issue reporting this error and how you " "installed cudf and strings_udf." 
) dev = cuda.get_current_device() cc = "".join(str(x) for x in dev.compute_capability) sms = [os.path.basename(f).rstrip(".ptx").lstrip("function_") for f in files] -selected_sm = max(sm for sm in sms if sm <= cc) +selected_sm = max(sm for sm in sms if sm < cc) dev_func_ptx = os.path.join( os.path.dirname(__file__), f"function_{selected_sm}.ptx" ) -def call_my_max_int32(data, size): - return my_max_int32(data, size) - - -def call_my_max_int64(data, size): +def call_max_int64(data, size): return my_max_int64(data, size) -def call_my_max_float64(data, size): +def call_max_float64(data, size): return my_max_float64(data, size) -def call_my_min_int32(data, size): - return my_min_int32(data, size) - - -def call_my_min_int64(data, size): +def call_min_int64(data, size): return my_min_int64(data, size) -def call_my_min_float64(data, size): +def call_min_float64(data, size): return my_min_float64(data, size) -def call_my_sum_int64(data, size): +def call_sum_int64(data, size): return my_sum_int64(data, size) -def call_my_sum_float64(data, size): +def call_sum_float64(data, size): return my_sum_float64(data, size) -def call_my_mean_int64(data, size): +def call_mean_int64(data, size): return my_mean_int64(data, size) -def call_my_mean_float64(data, size): +def call_mean_float64(data, size): return my_mean_float64(data, size) -def call_my_std_int64(data, size): +def call_std_int64(data, size): return my_std_int64(data, size) -def call_my_std_float64(data, size): +def call_std_float64(data, size): return my_std_float64(data, size) -def call_my_var_int64(data, size): +def call_var_int64(data, size): return my_var_int64(data, size) -def call_my_var_float64(data, size): +def call_var_float64(data, size): return my_var_float64(data, size) -def call_my_idxmax_int64(data, index, size): +def call_idxmax_int64(data, index, size): return my_idxmax_int64(data, index, size) -def call_my_idxmax_float64(data, index, size): +def call_idxmax_float64(data, index, size): return my_idxmax_float64(data, index, size) -def call_my_idxmin_int64(data, index, size): +def call_idxmin_int64(data, index, size): return my_idxmin_int64(data, index, size) -def call_my_idxmin_float64(data, index, size): +def call_idxmin_float64(data, index, size): return my_idxmin_float64(data, index, size) +call_cuda_functions = {} +call_cuda_functions['max'] = {} +call_cuda_functions['min'] = {} +call_cuda_functions['sum'] = {} +call_cuda_functions['mean'] = {} +call_cuda_functions['var'] = {} +call_cuda_functions['std'] = {} +call_cuda_functions['idxmax'] = {} +call_cuda_functions['idxmin'] = {} + +call_cuda_functions['max'][types.int64] = call_max_int64 +call_cuda_functions['min'][types.int64] = call_min_int64 +call_cuda_functions['sum'][types.int64] = call_sum_int64 +call_cuda_functions['mean'][types.int64] = call_mean_int64 +call_cuda_functions['std'][types.int64] = call_std_int64 +call_cuda_functions['var'][types.int64] = call_var_int64 +call_cuda_functions['idxmax'][types.int64] = call_idxmax_int64 +call_cuda_functions['idxmin'][types.int64] = call_idxmin_int64 + +call_cuda_functions['max'][types.float64] = call_max_float64 +call_cuda_functions['min'][types.float64] = call_min_float64 +call_cuda_functions['sum'][types.float64] = call_sum_float64 +call_cuda_functions['mean'][types.float64] = call_mean_float64 +call_cuda_functions['std'][types.float64] = call_std_float64 +call_cuda_functions['var'][types.float64] = call_var_float64 +call_cuda_functions['idxmax'][types.float64] = call_idxmax_float64 +call_cuda_functions['idxmin'][types.float64] 
= call_idxmin_float64 + @lower_builtin(Group, types.Array, types.int64, types.Array) def group_constructor(context, builder, sig, args): group_data, size, index = args @@ -433,11 +448,7 @@ def resolve_idxmin(self, mod): GroupIdxMin, GroupType(mod.group_scalar_type, mod.index_type) ) - -@cuda_lower("GroupType.max", GroupType(types.int32)) -@cuda_lower("GroupType.max", GroupType(types.int64)) -@cuda_lower("GroupType.max", GroupType(types.float64)) -def cuda_Group_max(context, builder, sig, args): +def lowering_function(context, builder, sig, args, function): retty = sig.return_type grp = cgutils.create_struct_proxy(sig.args[0])( @@ -449,52 +460,25 @@ def cuda_Group_max(context, builder, sig, args): group_data_ptr = builder.alloca(grp.group_data.type) builder.store(grp.group_data, group_data_ptr) - if grp_type.group_scalar_type == types.int32: - func = call_my_max_int32 - elif grp_type.group_scalar_type == types.int64: - func = call_my_max_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_max_float64 + func = call_cuda_functions[function][grp_type.group_scalar_type] - result = context.compile_internal( + return context.compile_internal( builder, func, nb_signature(retty, group_dataty, grp_type.size_type), (builder.load(group_data_ptr), grp.size), ) - return result +@cuda_lower("GroupType.max", GroupType(types.int64)) +@cuda_lower("GroupType.max", GroupType(types.float64)) +def cuda_Group_max(context, builder, sig, args): + return lowering_function(context, builder, sig, args, 'max') -@cuda_lower("GroupType.min", GroupType(types.int32)) @cuda_lower("GroupType.min", GroupType(types.int64)) @cuda_lower("GroupType.min", GroupType(types.float64)) def cuda_Group_min(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - if grp_type.group_scalar_type == types.int32: - func = call_my_min_int32 - elif grp_type.group_scalar_type == types.int64: - func = call_my_min_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_min_float64 - - result = context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.size_type), - (builder.load(group_data_ptr), grp.size), - ) - return result + return lowering_function(context, builder, sig, args, 'min') @cuda_lower("GroupType.size", GroupType(types.int64)) @@ -503,8 +487,7 @@ def cuda_Group_size(context, builder, sig, args): grp = cgutils.create_struct_proxy(sig.args[0])( context, builder, value=args[0] ) - result = grp.size - return result + return grp.size @cuda_lower("GroupType.count", GroupType(types.int64)) @@ -513,120 +496,31 @@ def cuda_Group_count(context, builder, sig, args): grp = cgutils.create_struct_proxy(sig.args[0])( context, builder, value=args[0] ) - result = grp.size - return result + return grp.size @cuda_lower("GroupType.sum", GroupType(types.int64)) @cuda_lower("GroupType.sum", GroupType(types.float64)) def cuda_Group_sum(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - if grp_type.group_scalar_type == types.int64: - func = call_my_sum_int64 - 
elif grp_type.group_scalar_type == types.float64: - func = call_my_sum_float64 - - result = context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.size_type), - (builder.load(group_data_ptr), grp.size), - ) - return result + return lowering_function(context, builder, sig, args, 'sum') @cuda_lower("GroupType.mean", GroupType(types.int64)) @cuda_lower("GroupType.mean", GroupType(types.float64)) def cuda_Group_mean(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - if grp_type.group_scalar_type == types.int64: - func = call_my_mean_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_mean_float64 - - result = context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.size_type), - (builder.load(group_data_ptr), grp.size), - ) - return result + return lowering_function(context, builder, sig, args, 'mean') @cuda_lower("GroupType.std", GroupType(types.int64)) @cuda_lower("GroupType.std", GroupType(types.float64)) def cuda_Group_std(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - if grp_type.group_scalar_type == types.int64: - func = call_my_std_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_std_float64 - - result = context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.size_type), - (builder.load(group_data_ptr), grp.size), - ) - return result + return lowering_function(context, builder, sig, args, 'std') @cuda_lower("GroupType.var", GroupType(types.int64)) @cuda_lower("GroupType.var", GroupType(types.float64)) def cuda_Group_var(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - if grp_type.group_scalar_type == types.int64: - func = call_my_var_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_var_float64 - - result = context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.size_type), - (builder.load(group_data_ptr), grp.size), - ) - return result + return lowering_function(context, builder, sig, args, 'var') @cuda_lower("GroupType.idxmax", GroupType(types.int64, types.int64)) @@ -647,18 +541,14 @@ def cuda_Group_idxmax(context, builder, sig, args): index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) - if grp_type.group_scalar_type == types.int64: - func = call_my_idxmax_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_idxmax_float64 + func = call_cuda_functions['idxmax'][grp_type.group_scalar_type] - result = context.compile_internal( + return context.compile_internal( builder, func, nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), ) - return result 
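# Note on the shared pattern in this file's lowering functions: each one looks
# up the matching device shim in call_cuda_functions[<op>][<group scalar type>]
# and uses context.compile_internal to emit a call that passes the group's data
# pointer (plus the index pointer for idxmax/idxmin) and the group size; the
# actual reduction runs in the PTX compiled from function.cu.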
@cuda_lower("GroupType.idxmin", GroupType(types.int64, types.int64)) @@ -679,18 +569,14 @@ def cuda_Group_idxmin(context, builder, sig, args): index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) - if grp_type.group_scalar_type == types.int64: - func = call_my_idxmin_int64 - elif grp_type.group_scalar_type == types.float64: - func = call_my_idxmin_float64 + func = call_cuda_functions['idxmin'][grp_type.group_scalar_type] - result = context.compile_internal( + return context.compile_internal( builder, func, nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), ) - return result def _get_frame_groupby_type(dtype, index_dtype): @@ -850,13 +736,10 @@ def jit_groupby_apply(offsets, grouped_values, function, *args, cache=True): # Dispatcher is specialized, so there's only one definition - get # it so we can get the cufunc from the code library kern_def = next(iter(specialized.overloads.values())) - kwargs = dict( - func=kern_def._codelibrary.get_cufunc(), - b2d_func=0, - memsize=0, - blocksizelimit=blocklim, - ) - _, tpb = ctx.get_max_potential_block_size(**kwargs) + grid, tpb = ctx.get_max_potential_block_size(func=kern_def._codelibrary.get_cufunc(), + b2d_func=0, + memsize=0, + blocksizelimit=blocklim) stream = cuda.default_stream() From d876ad77670c0cda384fb961fab2fbb61d8532da Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 21 Nov 2022 10:01:10 -0800 Subject: [PATCH 017/121] pass style, cleanup --- python/cudf/cudf/core/udf/function.cu | 113 +++++++++--------- python/cudf/cudf/core/udf/groupby_function.py | 88 +++++++------- python/cudf/cudf/core/udf/utils.py | 3 - python/strings_udf/cpp/CMakeLists.txt | 1 - 4 files changed, 104 insertions(+), 101 deletions(-) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/cudf/core/udf/function.cu index 872a1392331..1b308374675 100644 --- a/python/cudf/cudf/core/udf/function.cu +++ b/python/cudf/cudf/core/udf/function.cu @@ -83,8 +83,9 @@ __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ void device_sum(T const* data, int const items_per_thread, size_type size, T* sum) { - int tid = threadIdx.x; +__device__ void device_sum(T const* data, int const items_per_thread, size_type size, T* sum) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; T local_sum = 0; @@ -100,18 +101,18 @@ __device__ void device_sum(T const* data, int const items_per_thread, size_type atomicAdd(sum, local_sum); __syncthreads(); - } // Use a C++ templated __device__ function to implement the body of the algorithm. 
template -__device__ void device_var(T const* data, int const items_per_thread, size_type size, T* sum, double* var) { - +__device__ void device_var( + T const* data, int const items_per_thread, size_type size, T* sum, double* var) +{ int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on - T local_sum = 0; - double local_var = 0; + T local_sum = 0; + double local_var = 0; double mean; device_sum(data, items_per_thread, size, sum); @@ -124,9 +125,9 @@ __device__ void device_var(T const* data, int const items_per_thread, size_type #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; - double temp = load - mean; - temp = pow(temp, 2); + T load = data[tid + item * tb_size]; + double temp = load - mean; + temp = pow(temp, 2); local_var += temp; } } @@ -142,19 +143,20 @@ __device__ void device_var(T const* data, int const items_per_thread, size_type // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ void device_max(T const* data, int const items_per_thread, size_type size, T init_val, T* smax) { - - int tid = threadIdx.x; +__device__ void device_max( + T const* data, int const items_per_thread, size_type size, T init_val, T* smax) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; - T local_max = init_val; + T local_max = init_val; // Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; - local_max = max(local_max, load); + T load = data[tid + item * tb_size]; + local_max = max(local_max, load); } } @@ -168,19 +170,20 @@ __device__ void device_max(T const* data, int const items_per_thread, size_type // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ void device_min(T const* data, int const items_per_thread, size_type size, T init_val, T* smin) { - - int tid = threadIdx.x; +__device__ void device_min( + T const* data, int const items_per_thread, size_type size, T init_val, T* smin) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; - T local_min = init_val; + T local_min = init_val; // Calculate local min for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; - local_min = min(local_min, load); + T load = data[tid + item * tb_size]; + local_min = min(local_min, load); } } @@ -194,14 +197,20 @@ __device__ void device_min(T const* data, int const items_per_thread, size_type // Use a C++ templated __device__ function to implement the body of the algorithm. 
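// device_idxmax / device_idxmin below also record which row produced the
// extreme value: once the block-wide max (or min) has landed in *smax / *smin,
// every thread whose local extreme equals it calls atomicMin(sidx, local_idx),
// so ties resolve to the smallest candidate index. The calling wrapper
// initializes sidx to INT64_MAX before the reduction starts.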
template -__device__ void device_idxmax(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val, T* smax, int64_t* sidx) { - +__device__ void device_idxmax(T const* data, + int const items_per_thread, + int64_t const* index, + size_type size, + T init_val, + T* smax, + int64_t* sidx) +{ int tid = threadIdx.x; int tb_size = blockDim.x; // Calculate how many elements each thread is working on - T local_max = init_val; - int64_t local_idx = -1; + T local_max = init_val; + int64_t local_idx = -1; // Calculate local max for each thread #pragma unroll @@ -222,20 +231,26 @@ __device__ void device_idxmax(T const* data, int const items_per_thread, int64_t __syncthreads(); - if (local_max == (*smax)) { atomicMin(sidx,local_idx); } + if (local_max == (*smax)) { atomicMin(sidx, local_idx); } __syncthreads(); } // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ void device_idxmin(T const* data, int const items_per_thread, int64_t const* index, size_type size, T init_val, T* smin, int64_t* sidx) { - - int tid = threadIdx.x; +__device__ void device_idxmin(T const* data, + int const items_per_thread, + int64_t const* index, + size_type size, + T init_val, + T* smin, + int64_t* sidx) +{ + int tid = threadIdx.x; int tb_size = blockDim.x; - T local_min = init_val; - int64_t local_idx = -1; + T local_min = init_val; + int64_t local_idx = -1; // Calculate local max for each thread #pragma unroll @@ -270,12 +285,10 @@ extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, auto const items_per_thread = (size + tb_size - 1) / tb_size; __shared__ int64_t sum; - if (threadIdx.x == 0) { - sum = 0; - } + if (threadIdx.x == 0) { sum = 0; } __syncthreads(); - + device_sum(data, items_per_thread, size, &sum); *numba_return_value = sum; @@ -292,9 +305,7 @@ extern "C" __device__ int BlockSum_float64(double* numba_return_value, auto const items_per_thread = (size + tb_size - 1) / tb_size; __shared__ double sum; - if (threadIdx.x == 0) { - sum = 0; - } + if (threadIdx.x == 0) { sum = 0; } __syncthreads(); @@ -314,9 +325,7 @@ extern "C" __device__ int BlockMean_int64(double* numba_return_value, auto const items_per_thread = (size + tb_size - 1) / tb_size; __shared__ int64_t sum; - if (threadIdx.x == 0) { - sum = 0; - } + if (threadIdx.x == 0) { sum = 0; } __syncthreads(); @@ -338,9 +347,7 @@ extern "C" __device__ int BlockMean_float64(double* numba_return_value, auto const items_per_thread = (size + tb_size - 1) / tb_size; __shared__ double sum; - if (threadIdx.x == 0) { - sum = 0; - } + if (threadIdx.x == 0) { sum = 0; } __syncthreads(); @@ -464,9 +471,7 @@ extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, __shared__ int64_t smax; - if (threadIdx.x == 0) { - smax = INT64_MIN; - } + if (threadIdx.x == 0) { smax = INT64_MIN; } __syncthreads(); @@ -488,9 +493,7 @@ extern "C" __device__ int BlockMax_float64(double* numba_return_value, __shared__ double smax; - if (threadIdx.x == 0) { - smax = -DBL_MAX; - } + if (threadIdx.x == 0) { smax = -DBL_MAX; } __syncthreads(); @@ -512,9 +515,7 @@ extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, __shared__ int64_t smin; - if (threadIdx.x == 0) { - smin = INT64_MAX; - } + if (threadIdx.x == 0) { smin = INT64_MAX; } __syncthreads(); @@ -536,9 +537,7 @@ extern "C" __device__ int BlockMin_float64(double* numba_return_value, __shared__ double smin; - if (threadIdx.x == 0) { - smin = DBL_MAX; - } + if (threadIdx.x == 0) { smin = DBL_MAX; } 
__syncthreads(); diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 8a7636e4427..e4e24191a36 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -3,6 +3,7 @@ import glob import math import os +from typing import Any, Dict import cupy as cp import numba @@ -46,6 +47,7 @@ index_default_type = types.int64 + class Group(object): def __init__(self, group_data, size, index, dtype, index_dtype): self.group_data = group_data @@ -75,7 +77,8 @@ def typeof_group(val, c): ) # Identifies instances of the Group class as GroupType -# The typing of the python "function" Group.__init__ as it appears in python code +# The typing of the python "function" Group.__init__ +# as it appears in python code @type_callable(Group) def type_group(context): def typer(group_data, size, index): @@ -268,33 +271,34 @@ def call_idxmin_float64(data, index, size): return my_idxmin_float64(data, index, size) -call_cuda_functions = {} -call_cuda_functions['max'] = {} -call_cuda_functions['min'] = {} -call_cuda_functions['sum'] = {} -call_cuda_functions['mean'] = {} -call_cuda_functions['var'] = {} -call_cuda_functions['std'] = {} -call_cuda_functions['idxmax'] = {} -call_cuda_functions['idxmin'] = {} - -call_cuda_functions['max'][types.int64] = call_max_int64 -call_cuda_functions['min'][types.int64] = call_min_int64 -call_cuda_functions['sum'][types.int64] = call_sum_int64 -call_cuda_functions['mean'][types.int64] = call_mean_int64 -call_cuda_functions['std'][types.int64] = call_std_int64 -call_cuda_functions['var'][types.int64] = call_var_int64 -call_cuda_functions['idxmax'][types.int64] = call_idxmax_int64 -call_cuda_functions['idxmin'][types.int64] = call_idxmin_int64 - -call_cuda_functions['max'][types.float64] = call_max_float64 -call_cuda_functions['min'][types.float64] = call_min_float64 -call_cuda_functions['sum'][types.float64] = call_sum_float64 -call_cuda_functions['mean'][types.float64] = call_mean_float64 -call_cuda_functions['std'][types.float64] = call_std_float64 -call_cuda_functions['var'][types.float64] = call_var_float64 -call_cuda_functions['idxmax'][types.float64] = call_idxmax_float64 -call_cuda_functions['idxmin'][types.float64] = call_idxmin_float64 +call_cuda_functions: Dict[Any, Any] = {} +call_cuda_functions["max"] = {} +call_cuda_functions["min"] = {} +call_cuda_functions["sum"] = {} +call_cuda_functions["mean"] = {} +call_cuda_functions["var"] = {} +call_cuda_functions["std"] = {} +call_cuda_functions["idxmax"] = {} +call_cuda_functions["idxmin"] = {} + +call_cuda_functions["max"][types.int64] = call_max_int64 +call_cuda_functions["min"][types.int64] = call_min_int64 +call_cuda_functions["sum"][types.int64] = call_sum_int64 +call_cuda_functions["mean"][types.int64] = call_mean_int64 +call_cuda_functions["std"][types.int64] = call_std_int64 +call_cuda_functions["var"][types.int64] = call_var_int64 +call_cuda_functions["idxmax"][types.int64] = call_idxmax_int64 +call_cuda_functions["idxmin"][types.int64] = call_idxmin_int64 + +call_cuda_functions["max"][types.float64] = call_max_float64 +call_cuda_functions["min"][types.float64] = call_min_float64 +call_cuda_functions["sum"][types.float64] = call_sum_float64 +call_cuda_functions["mean"][types.float64] = call_mean_float64 +call_cuda_functions["std"][types.float64] = call_std_float64 +call_cuda_functions["var"][types.float64] = call_var_float64 +call_cuda_functions["idxmax"][types.float64] = call_idxmax_float64 
+call_cuda_functions["idxmin"][types.float64] = call_idxmin_float64 + @lower_builtin(Group, types.Array, types.int64, types.Array) def group_constructor(context, builder, sig, args): @@ -448,6 +452,7 @@ def resolve_idxmin(self, mod): GroupIdxMin, GroupType(mod.group_scalar_type, mod.index_type) ) + def lowering_function(context, builder, sig, args, function): retty = sig.return_type @@ -469,16 +474,17 @@ def lowering_function(context, builder, sig, args, function): (builder.load(group_data_ptr), grp.size), ) + @cuda_lower("GroupType.max", GroupType(types.int64)) @cuda_lower("GroupType.max", GroupType(types.float64)) def cuda_Group_max(context, builder, sig, args): - return lowering_function(context, builder, sig, args, 'max') + return lowering_function(context, builder, sig, args, "max") @cuda_lower("GroupType.min", GroupType(types.int64)) @cuda_lower("GroupType.min", GroupType(types.float64)) def cuda_Group_min(context, builder, sig, args): - return lowering_function(context, builder, sig, args, 'min') + return lowering_function(context, builder, sig, args, "min") @cuda_lower("GroupType.size", GroupType(types.int64)) @@ -502,25 +508,25 @@ def cuda_Group_count(context, builder, sig, args): @cuda_lower("GroupType.sum", GroupType(types.int64)) @cuda_lower("GroupType.sum", GroupType(types.float64)) def cuda_Group_sum(context, builder, sig, args): - return lowering_function(context, builder, sig, args, 'sum') + return lowering_function(context, builder, sig, args, "sum") @cuda_lower("GroupType.mean", GroupType(types.int64)) @cuda_lower("GroupType.mean", GroupType(types.float64)) def cuda_Group_mean(context, builder, sig, args): - return lowering_function(context, builder, sig, args, 'mean') + return lowering_function(context, builder, sig, args, "mean") @cuda_lower("GroupType.std", GroupType(types.int64)) @cuda_lower("GroupType.std", GroupType(types.float64)) def cuda_Group_std(context, builder, sig, args): - return lowering_function(context, builder, sig, args, 'std') + return lowering_function(context, builder, sig, args, "std") @cuda_lower("GroupType.var", GroupType(types.int64)) @cuda_lower("GroupType.var", GroupType(types.float64)) def cuda_Group_var(context, builder, sig, args): - return lowering_function(context, builder, sig, args, 'var') + return lowering_function(context, builder, sig, args, "var") @cuda_lower("GroupType.idxmax", GroupType(types.int64, types.int64)) @@ -541,7 +547,7 @@ def cuda_Group_idxmax(context, builder, sig, args): index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) - func = call_cuda_functions['idxmax'][grp_type.group_scalar_type] + func = call_cuda_functions["idxmax"][grp_type.group_scalar_type] return context.compile_internal( builder, @@ -569,7 +575,7 @@ def cuda_Group_idxmin(context, builder, sig, args): index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) - func = call_cuda_functions['idxmin'][grp_type.group_scalar_type] + func = call_cuda_functions["idxmin"][grp_type.group_scalar_type] return context.compile_internal( builder, @@ -736,10 +742,12 @@ def jit_groupby_apply(offsets, grouped_values, function, *args, cache=True): # Dispatcher is specialized, so there's only one definition - get # it so we can get the cufunc from the code library kern_def = next(iter(specialized.overloads.values())) - grid, tpb = ctx.get_max_potential_block_size(func=kern_def._codelibrary.get_cufunc(), - b2d_func=0, - memsize=0, - blocksizelimit=blocklim) + grid, tpb = ctx.get_max_potential_block_size( + 
func=kern_def._codelibrary.get_cufunc(), + b2d_func=0, + memsize=0, + blocksizelimit=blocklim, + ) stream = cuda.default_stream() diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 0a906e1fd34..bb4ea351fca 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -253,9 +253,6 @@ def _get_kernel_groupby_apply(kernel_string, globals_, func, dev_func_ptx): return kernel -launch_arg_getters: Dict[Any, Any] = {} - - def _get_input_args_from_frame(fr): args = [] offsets = [] diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index 50617196ac2..4bcb65e3aae 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -105,7 +105,6 @@ message("\n\n\n") message("${CMAKE_CUDA_ARCHITECTURES}") message("\n\n\n") - foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) set(tgt shim_${arch}) From f6001969f320120c93db781bfd5d6cf47f23da6a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 21 Nov 2022 12:20:05 -0800 Subject: [PATCH 018/121] start to move files --- python/cudf/CMakeLists.txt | 2 +- .../core/udf => udf_cpp/groupby}/CMakeLists.txt | 16 ---------------- .../core/udf => udf_cpp/groupby}/function.cu | 0 3 files changed, 1 insertion(+), 17 deletions(-) rename python/cudf/{cudf/core/udf => udf_cpp/groupby}/CMakeLists.txt (76%) rename python/cudf/{cudf/core/udf => udf_cpp/groupby}/function.cu (100%) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 1c1e41cbf82..62ecdfb23f1 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -123,7 +123,7 @@ endif() rapids_cython_init() add_subdirectory(cudf/_lib) -add_subdirectory(cudf/core/udf) +add_subdirectory(udf_cpp/groupby) include(cmake/Modules/ProtobufHelpers.cmake) codegen_protoc(cudf/utils/metadata/orc_column_statistics.proto) diff --git a/python/cudf/cudf/core/udf/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt similarity index 76% rename from python/cudf/cudf/core/udf/CMakeLists.txt rename to python/cudf/udf_cpp/groupby/CMakeLists.txt index 6e81a218034..0a35c8ee0b4 100644 --- a/python/cudf/cudf/core/udf/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -29,22 +29,6 @@ project( rapids_find_package(CUDAToolkit REQUIRED) -# include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx(BUILD_EXPORT_SET -# strings-udf-exports INSTALL_EXPORT_SET strings-udf-exports) - -# add_library(groupby_udf_cpp SHARED function.cu) target_include_directories( groupby_udf_cpp PUBLIC -# "$" ) - -# set_target_properties( groupby_udf_cpp PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" -# CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON -# POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) -# -# set(UDF_CXX_FLAGS) set(UDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) -# target_compile_options( groupby_udf_cpp PRIVATE "$<$:${UDF_CXX_FLAGS}>" -# "$<$:${UDF_CUDA_FLAGS}>" -# ) target_link_libraries(groupby_udf_cpp PUBLIC cudf::cudf CUDA::nvrtc) install(TARGETS -# groupby_udf_cpp DESTINATION ./cudf/core/udf/) - # This function will copy the generated PTX file from its generator-specific location in the build # tree into a specified location in the build tree from which we can install it. 
function(copy_ptx_to_location target destination) diff --git a/python/cudf/cudf/core/udf/function.cu b/python/cudf/udf_cpp/groupby/function.cu similarity index 100% rename from python/cudf/cudf/core/udf/function.cu rename to python/cudf/udf_cpp/groupby/function.cu From 6cbdaf8e8831c1625863b7749bad254c0d253a6d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 22 Nov 2022 09:51:32 -0800 Subject: [PATCH 019/121] starting to refactor --- python/cudf/cudf/core/udf/groupby_function.py | 85 ++++++------------- python/cudf/cudf/core/udf/utils.py | 55 ++++++++++++ python/strings_udf/strings_udf/__init__.py | 58 +------------ 3 files changed, 86 insertions(+), 112 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index e4e24191a36..1e24945ee3d 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -1,6 +1,5 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -import glob import math import os from typing import Any, Dict @@ -35,6 +34,7 @@ _all_dtypes_from_frame, _compile_or_get, _get_kernel_groupby_apply, + _get_ptx_file, _get_udf_return_type, _supported_cols_from_frame, _supported_dtypes_from_frame, @@ -46,6 +46,7 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 index_default_type = types.int64 +dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") class Group(object): @@ -105,23 +106,34 @@ def __init__( models.StructModel.__init__(self, dmm, fe_type, members) -my_max_int64 = cuda.declare_device( - "BlockMax_int64", "types.int64(types.CPointer(types.int64),types.int64)" -) +_funcs = ["Max", "Min"] +_types = [types.int64, types.float64] +_cuda_funcs = {} +for func in _funcs: + for ty in _types: + _cuda_funcs[func.lower()] = cuda.declare_device( + f"Block{func}_{ty}", ty(types.CPointer(ty), types.int64) + ) -my_max_float64 = cuda.declare_device( - "BlockMax_float64", - "types.float64(types.CPointer(types.float64),types.int64)", -) +call_cuda_functions: Dict[Any, Any] = {} -my_min_int64 = cuda.declare_device( - "BlockMin_int64", "types.int64(types.CPointer(types.int64),types.int64)" -) -my_min_float64 = cuda.declare_device( - "BlockMin_float64", - "types.float64(types.CPointer(types.float64),types.int64)", -) +def _register_cuda_reduction_caller(func, ty): + func = func.lower() + cuda_func = _cuda_funcs[func] + + def caller(data, size): + return cuda_func(data, size) + + if call_cuda_functions.get(func.lower()) is None: + call_cuda_functions[func] = {} + call_cuda_functions[func][ty] = caller + + +_register_cuda_reduction_caller("max", types.int64) +_register_cuda_reduction_caller("max", types.float64) +_register_cuda_reduction_caller("min", types.int64) +_register_cuda_reduction_caller("min", types.float64) my_sum_int64 = cuda.declare_device( "BlockSum_int64", "types.int64(types.CPointer(types.int64),types.int64)" @@ -187,42 +199,6 @@ def __init__( ) -# Load the highest compute capability file available that is less than -# the current device's. -files = glob.glob( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "function_*.ptx") -) -if len(files) == 0: - raise RuntimeError( - "This strings_udf installation is missing the necessary PTX " - "files. Please file an issue reporting this error and how you " - "installed cudf and strings_udf." 
- ) -dev = cuda.get_current_device() -cc = "".join(str(x) for x in dev.compute_capability) -sms = [os.path.basename(f).rstrip(".ptx").lstrip("function_") for f in files] -selected_sm = max(sm for sm in sms if sm <= cc) -dev_func_ptx = os.path.join( - os.path.dirname(__file__), f"function_{selected_sm}.ptx" -) - - -def call_max_int64(data, size): - return my_max_int64(data, size) - - -def call_max_float64(data, size): - return my_max_float64(data, size) - - -def call_min_int64(data, size): - return my_min_int64(data, size) - - -def call_min_float64(data, size): - return my_min_float64(data, size) - - def call_sum_int64(data, size): return my_sum_int64(data, size) @@ -271,9 +247,6 @@ def call_idxmin_float64(data, index, size): return my_idxmin_float64(data, index, size) -call_cuda_functions: Dict[Any, Any] = {} -call_cuda_functions["max"] = {} -call_cuda_functions["min"] = {} call_cuda_functions["sum"] = {} call_cuda_functions["mean"] = {} call_cuda_functions["var"] = {} @@ -281,8 +254,6 @@ def call_idxmin_float64(data, index, size): call_cuda_functions["idxmax"] = {} call_cuda_functions["idxmin"] = {} -call_cuda_functions["max"][types.int64] = call_max_int64 -call_cuda_functions["min"][types.int64] = call_min_int64 call_cuda_functions["sum"][types.int64] = call_sum_int64 call_cuda_functions["mean"][types.int64] = call_mean_int64 call_cuda_functions["std"][types.int64] = call_std_int64 @@ -290,8 +261,6 @@ def call_idxmin_float64(data, index, size): call_cuda_functions["idxmax"][types.int64] = call_idxmax_int64 call_cuda_functions["idxmin"][types.int64] = call_idxmin_int64 -call_cuda_functions["max"][types.float64] = call_max_float64 -call_cuda_functions["min"][types.float64] = call_min_float64 call_cuda_functions["sum"][types.float64] = call_sum_float64 call_cuda_functions["mean"][types.float64] = call_mean_float64 call_cuda_functions["std"][types.float64] = call_std_float64 diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bb4ea351fca..0a9833fe5b8 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,5 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. +import glob +import os from typing import Any, Callable, Dict, List import cachetools @@ -283,3 +285,56 @@ def _post_process_output_col(col, retty): if getter := output_col_getters.get(retty): col = getter(col) return as_column(col, retty) + + +def _get_appropriate_file(sms, cc): + filtered_sms = list(filter(lambda x: x[0] <= cc, sms)) + if filtered_sms: + return max(filtered_sms, key=lambda y: y[0]) + else: + return None + + +def _get_ptx_file(path, prefix): + if "RAPIDS_NO_INITIALIZE" in os.environ: + # shim_60.ptx is always built + cc = int(os.environ.get("STRINGS_UDF_CC", "60")) + else: + dev = cuda.get_current_device() + + # Load the highest compute capability file available that is less than + # the current device's. + cc = int("".join(str(x) for x in dev.compute_capability)) + files = glob.glob(os.path.join(path, f"{prefix}*.ptx")) + if len(files) == 0: + raise RuntimeError( + "This strings_udf installation is missing the necessary PTX " + f"files for compute capability {cc}. " + "Please file an issue reporting this error and how you " + "installed cudf and strings_udf." 
+ "https://github.com/rapidsai/cudf/issues" + ) + regular_sms = [] + + for f in files: + file_name = os.path.basename(f) + sm_number = file_name.rstrip(".ptx").lstrip(prefix) + if sm_number.endswith("a"): + processed_sm_number = int(sm_number.rstrip("a")) + if processed_sm_number == cc: + return f + else: + regular_sms.append((int(sm_number), f)) + + regular_result = None + + if regular_sms: + regular_result = _get_appropriate_file(regular_sms, cc) + + if regular_result is None: + raise RuntimeError( + "This strings_udf installation is missing the necessary PTX " + f"files that are <={cc}." + ) + else: + return regular_result[1] diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index bf13b79ab90..0b4dc11779a 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -8,6 +8,8 @@ from numba.cuda.cudadrv.driver import Linker from ptxcompiler.patch import NO_DRIVER, safe_get_versions +from cudf.core.udf.utils import _get_ptx_file + from . import _version __version__ = _version.get_versions()["version"] @@ -17,13 +19,7 @@ # tracks the version of CUDA used to build the c++ and PTX components STRINGS_UDF_PTX_VERSION = (11, 5) - -def _get_appropriate_file(sms, cc): - filtered_sms = list(filter(lambda x: x[0] <= cc, sms)) - if filtered_sms: - return max(filtered_sms, key=lambda y: y[0]) - else: - return None +path = os.path.dirname(__file__) def maybe_patch_numba_linker(driver_version): @@ -42,52 +38,6 @@ def maybe_patch_numba_linker(driver_version): logger.debug("Cannot patch Numba Linker - unsupported version") -def _get_ptx_file(): - if "RAPIDS_NO_INITIALIZE" in os.environ: - # shim_60.ptx is always built - cc = int(os.environ.get("STRINGS_UDF_CC", "60")) - else: - dev = cuda.get_current_device() - - # Load the highest compute capability file available that is less than - # the current device's. - cc = int("".join(str(x) for x in dev.compute_capability)) - files = glob.glob(os.path.join(os.path.dirname(__file__), "shim_*.ptx")) - if len(files) == 0: - raise RuntimeError( - "This strings_udf installation is missing the necessary PTX " - f"files for compute capability {cc}. " - "Please file an issue reporting this error and how you " - "installed cudf and strings_udf." - "https://github.com/rapidsai/cudf/issues" - ) - - regular_sms = [] - - for f in files: - file_name = os.path.basename(f) - sm_number = file_name.rstrip(".ptx").lstrip("shim_") - if sm_number.endswith("a"): - processed_sm_number = int(sm_number.rstrip("a")) - if processed_sm_number == cc: - return f - else: - regular_sms.append((int(sm_number), f)) - - regular_result = None - - if regular_sms: - regular_result = _get_appropriate_file(regular_sms, cc) - - if regular_result is None: - raise RuntimeError( - "This strings_udf installation is missing the necessary PTX " - f"files that are <={cc}." 
- ) - else: - return regular_result[1] - - # Maximum size of a string column is 2 GiB _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( "STRINGS_UDF_HEAP_SIZE", 2**31 @@ -117,4 +67,4 @@ def set_malloc_heap_size(size=None): if versions != NO_DRIVER: driver_version, runtime_version = versions maybe_patch_numba_linker(driver_version) - ptxpath = _get_ptx_file() + ptxpath = _get_ptx_file(path, "shim_") From 3a11fe11c545f710207b8b15a62521dd802bcab9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Nov 2022 07:18:01 -0800 Subject: [PATCH 020/121] continue to refactor typing --- python/cudf/cudf/core/udf/groupby_function.py | 122 +++--------------- 1 file changed, 21 insertions(+), 101 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 1e24945ee3d..48603b342be 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -106,73 +106,39 @@ def __init__( models.StructModel.__init__(self, dmm, fe_type, members) -_funcs = ["Max", "Min"] -_types = [types.int64, types.float64] -_cuda_funcs = {} -for func in _funcs: - for ty in _types: - _cuda_funcs[func.lower()] = cuda.declare_device( - f"Block{func}_{ty}", ty(types.CPointer(ty), types.int64) - ) +SUPPORTED_INPUT_TYPES = [types.int64, types.float64] + call_cuda_functions: Dict[Any, Any] = {} -def _register_cuda_reduction_caller(func, ty): - func = func.lower() - cuda_func = _cuda_funcs[func] +def _register_cuda_reduction_caller(func, inputty, retty): + cuda_func = cuda.declare_device( + f"Block{func}_{inputty}", retty(types.CPointer(inputty), types.int64) + ) def caller(data, size): return cuda_func(data, size) if call_cuda_functions.get(func.lower()) is None: - call_cuda_functions[func] = {} - call_cuda_functions[func][ty] = caller - - -_register_cuda_reduction_caller("max", types.int64) -_register_cuda_reduction_caller("max", types.float64) -_register_cuda_reduction_caller("min", types.int64) -_register_cuda_reduction_caller("min", types.float64) - -my_sum_int64 = cuda.declare_device( - "BlockSum_int64", "types.int64(types.CPointer(types.int64),types.int64)" -) - -my_sum_float64 = cuda.declare_device( - "BlockSum_float64", - "types.float64(types.CPointer(types.float64),types.int64)", -) + call_cuda_functions[func.lower()] = {} + call_cuda_functions[func.lower()][retty] = caller -my_mean_int64 = cuda.declare_device( - "BlockMean_int64", - "types.float64(types.CPointer(types.int64),types.int64)", -) -my_mean_float64 = cuda.declare_device( - "BlockMean_float64", - "types.float64(types.CPointer(types.float64),types.int64)", -) +_register_cuda_reduction_caller("Max", types.float64, types.float64) +_register_cuda_reduction_caller("Max", types.int64, types.int64) +_register_cuda_reduction_caller("Min", types.float64, types.float64) +_register_cuda_reduction_caller("Min", types.int64, types.int64) +_register_cuda_reduction_caller("Min", types.float64, types.float64) +_register_cuda_reduction_caller("Sum", types.int64, types.int64) +_register_cuda_reduction_caller("Sum", types.float64, types.float64) +_register_cuda_reduction_caller("Mean", types.int64, types.float64) +_register_cuda_reduction_caller("Mean", types.float64, types.float64) +_register_cuda_reduction_caller("Std", types.int64, types.float64) +_register_cuda_reduction_caller("Std", types.float64, types.float64) +_register_cuda_reduction_caller("Var", types.int64, types.float64) +_register_cuda_reduction_caller("Var", types.float64, types.float64) -my_std_int64 
= cuda.declare_device( - "BlockStd_int64", - "types.float64(types.CPointer(types.int64),types.int64)", -) - -my_std_float64 = cuda.declare_device( - "BlockStd_float64", - "types.float64(types.CPointer(types.float64),types.int64)", -) - -my_var_int64 = cuda.declare_device( - "BlockVar_int64", - "types.float64(types.CPointer(types.int64),types.int64)", -) - -my_var_float64 = cuda.declare_device( - "BlockVar_float64", - "types.float64(types.CPointer(types.float64),types.int64)", -) my_idxmax_int64 = cuda.declare_device( "BlockIdxMax_int64", @@ -199,38 +165,6 @@ def caller(data, size): ) -def call_sum_int64(data, size): - return my_sum_int64(data, size) - - -def call_sum_float64(data, size): - return my_sum_float64(data, size) - - -def call_mean_int64(data, size): - return my_mean_int64(data, size) - - -def call_mean_float64(data, size): - return my_mean_float64(data, size) - - -def call_std_int64(data, size): - return my_std_int64(data, size) - - -def call_std_float64(data, size): - return my_std_float64(data, size) - - -def call_var_int64(data, size): - return my_var_int64(data, size) - - -def call_var_float64(data, size): - return my_var_float64(data, size) - - def call_idxmax_int64(data, index, size): return my_idxmax_int64(data, index, size) @@ -247,24 +181,10 @@ def call_idxmin_float64(data, index, size): return my_idxmin_float64(data, index, size) -call_cuda_functions["sum"] = {} -call_cuda_functions["mean"] = {} -call_cuda_functions["var"] = {} -call_cuda_functions["std"] = {} call_cuda_functions["idxmax"] = {} call_cuda_functions["idxmin"] = {} - -call_cuda_functions["sum"][types.int64] = call_sum_int64 -call_cuda_functions["mean"][types.int64] = call_mean_int64 -call_cuda_functions["std"][types.int64] = call_std_int64 -call_cuda_functions["var"][types.int64] = call_var_int64 call_cuda_functions["idxmax"][types.int64] = call_idxmax_int64 call_cuda_functions["idxmin"][types.int64] = call_idxmin_int64 - -call_cuda_functions["sum"][types.float64] = call_sum_float64 -call_cuda_functions["mean"][types.float64] = call_mean_float64 -call_cuda_functions["std"][types.float64] = call_std_float64 -call_cuda_functions["var"][types.float64] = call_var_float64 call_cuda_functions["idxmax"][types.float64] = call_idxmax_float64 call_cuda_functions["idxmin"][types.float64] = call_idxmin_float64 From 81615485c8d3d8eef2edcfd670fc5d9e5a596826 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Nov 2022 07:50:21 -0800 Subject: [PATCH 021/121] move lowering to its own file --- python/cudf/cudf/core/udf/__init__.py | 7 +- python/cudf/cudf/core/udf/groupby_function.py | 158 ---------------- python/cudf/cudf/core/udf/groupby_lowering.py | 168 ++++++++++++++++++ 3 files changed, 174 insertions(+), 159 deletions(-) create mode 100644 python/cudf/cudf/core/udf/groupby_lowering.py diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 8092207e037..a6c9fbe2b2a 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -9,7 +9,12 @@ from cudf.core.udf import api, row_function, utils from cudf.utils.dtypes import STRING_TYPES -from . import masked_lowering, masked_typing +from . 
import ( + groupby_function, + groupby_lowering, + masked_lowering, + masked_typing, +) _units = ["ns", "ms", "us", "s"] _datetime_cases = {types.NPDatetime(u) for u in _units} diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 48603b342be..45645a3b75e 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -8,9 +8,7 @@ import numba import numpy as np from numba import cuda, types -from numba.core import cgutils from numba.core.extending import ( - lower_builtin, make_attribute_wrapper, models, register_model, @@ -21,7 +19,6 @@ from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry from numba.cuda.cudadrv.devices import get_context -from numba.cuda.cudaimpl import lower as cuda_lower from numba.np import numpy_support from numba.types import Record @@ -189,29 +186,6 @@ def call_idxmin_float64(data, index, size): call_cuda_functions["idxmin"][types.float64] = call_idxmin_float64 -@lower_builtin(Group, types.Array, types.int64, types.Array) -def group_constructor(context, builder, sig, args): - group_data, size, index = args - - grp = cgutils.create_struct_proxy(sig.return_type)(context, builder) - - arr_group_data = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=group_data - ) - group_data_ptr = arr_group_data.data - - arr_index = cgutils.create_struct_proxy(sig.args[2])( - context, builder, value=index - ) - index_ptr = arr_index.data - - grp.group_data = group_data_ptr - grp.index = index_ptr - grp.size = size - - return grp._getvalue() - - make_attribute_wrapper(GroupType, "group_data", "group_data") make_attribute_wrapper(GroupType, "index", "index") make_attribute_wrapper(GroupType, "size", "size") @@ -342,138 +316,6 @@ def resolve_idxmin(self, mod): ) -def lowering_function(context, builder, sig, args, function): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - func = call_cuda_functions[function][grp_type.group_scalar_type] - - return context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.size_type), - (builder.load(group_data_ptr), grp.size), - ) - - -@cuda_lower("GroupType.max", GroupType(types.int64)) -@cuda_lower("GroupType.max", GroupType(types.float64)) -def cuda_Group_max(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "max") - - -@cuda_lower("GroupType.min", GroupType(types.int64)) -@cuda_lower("GroupType.min", GroupType(types.float64)) -def cuda_Group_min(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "min") - - -@cuda_lower("GroupType.size", GroupType(types.int64)) -@cuda_lower("GroupType.size", GroupType(types.float64)) -def cuda_Group_size(context, builder, sig, args): - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - return grp.size - - -@cuda_lower("GroupType.count", GroupType(types.int64)) -@cuda_lower("GroupType.count", GroupType(types.float64)) -def cuda_Group_count(context, builder, sig, args): - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - return grp.size - - -@cuda_lower("GroupType.sum", GroupType(types.int64)) 
-@cuda_lower("GroupType.sum", GroupType(types.float64)) -def cuda_Group_sum(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "sum") - - -@cuda_lower("GroupType.mean", GroupType(types.int64)) -@cuda_lower("GroupType.mean", GroupType(types.float64)) -def cuda_Group_mean(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "mean") - - -@cuda_lower("GroupType.std", GroupType(types.int64)) -@cuda_lower("GroupType.std", GroupType(types.float64)) -def cuda_Group_std(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "std") - - -@cuda_lower("GroupType.var", GroupType(types.int64)) -@cuda_lower("GroupType.var", GroupType(types.float64)) -def cuda_Group_var(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "var") - - -@cuda_lower("GroupType.idxmax", GroupType(types.int64, types.int64)) -@cuda_lower("GroupType.idxmax", GroupType(types.float64, types.int64)) -def cuda_Group_idxmax(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - index_dataty = grp_type.group_index_type - index_ptr = builder.alloca(grp.index.type) - builder.store(grp.index, index_ptr) - - func = call_cuda_functions["idxmax"][grp_type.group_scalar_type] - - return context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), - (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), - ) - - -@cuda_lower("GroupType.idxmin", GroupType(types.int64, types.int64)) -@cuda_lower("GroupType.idxmin", GroupType(types.float64, types.int64)) -def cuda_Group_idxmin(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - index_dataty = grp_type.group_index_type - index_ptr = builder.alloca(grp.index.type) - builder.store(grp.index, index_ptr) - - func = call_cuda_functions["idxmin"][grp_type.group_scalar_type] - - return context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), - (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), - ) - - def _get_frame_groupby_type(dtype, index_dtype): """ Get the numba `Record` type corresponding to a frame. diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py new file mode 100644 index 00000000000..ca2623cf706 --- /dev/null +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+
+from numba import types
+from numba.core import cgutils
+from numba.core.extending import lower_builtin
+from numba.core.typing import signature as nb_signature
+from numba.cuda.cudaimpl import lower as cuda_lower
+
+from cudf.core.udf.groupby_function import (
+    Group,
+    GroupType,
+    call_cuda_functions,
+)
+
+
+def lowering_function(context, builder, sig, args, function):
+    retty = sig.return_type
+
+    grp = cgutils.create_struct_proxy(sig.args[0])(
+        context, builder, value=args[0]
+    )
+    grp_type = sig.args[0]
+
+    group_dataty = grp_type.group_data_type
+    group_data_ptr = builder.alloca(grp.group_data.type)
+    builder.store(grp.group_data, group_data_ptr)
+
+    func = call_cuda_functions[function][grp_type.group_scalar_type]
+
+    return context.compile_internal(
+        builder,
+        func,
+        nb_signature(retty, group_dataty, grp_type.size_type),
+        (builder.load(group_data_ptr), grp.size),
+    )
+
+
+@lower_builtin(Group, types.Array, types.int64, types.Array)
+def group_constructor(context, builder, sig, args):
+    group_data, size, index = args
+
+    grp = cgutils.create_struct_proxy(sig.return_type)(context, builder)
+
+    arr_group_data = cgutils.create_struct_proxy(sig.args[0])(
+        context, builder, value=group_data
+    )
+    group_data_ptr = arr_group_data.data
+
+    arr_index = cgutils.create_struct_proxy(sig.args[2])(
+        context, builder, value=index
+    )
+    index_ptr = arr_index.data
+
+    grp.group_data = group_data_ptr
+    grp.index = index_ptr
+    grp.size = size
+
+    return grp._getvalue()
+
+
+@cuda_lower("GroupType.max", GroupType(types.int64))
+@cuda_lower("GroupType.max", GroupType(types.float64))
+def cuda_Group_max(context, builder, sig, args):
+    return lowering_function(context, builder, sig, args, "max")
+
+
+@cuda_lower("GroupType.min", GroupType(types.int64))
+@cuda_lower("GroupType.min", GroupType(types.float64))
+def cuda_Group_min(context, builder, sig, args):
+    return lowering_function(context, builder, sig, args, "min")
+
+
+@cuda_lower("GroupType.size", GroupType(types.int64))
+@cuda_lower("GroupType.size", GroupType(types.float64))
+def cuda_Group_size(context, builder, sig, args):
+    grp = cgutils.create_struct_proxy(sig.args[0])(
+        context, builder, value=args[0]
+    )
+    return grp.size
+
+
+@cuda_lower("GroupType.count", GroupType(types.int64))
+@cuda_lower("GroupType.count", GroupType(types.float64))
+def cuda_Group_count(context, builder, sig, args):
+    grp = cgutils.create_struct_proxy(sig.args[0])(
+        context, builder, value=args[0]
+    )
+    return grp.size
+
+
+@cuda_lower("GroupType.sum", GroupType(types.int64))
+@cuda_lower("GroupType.sum", GroupType(types.float64))
+def cuda_Group_sum(context, builder, sig, args):
+    return lowering_function(context, builder, sig, args, "sum")
+
+
+@cuda_lower("GroupType.mean", GroupType(types.int64))
+@cuda_lower("GroupType.mean", GroupType(types.float64))
+def cuda_Group_mean(context, builder, sig, args):
+    return lowering_function(context, builder, sig, args, "mean")
+
+
+@cuda_lower("GroupType.std", GroupType(types.int64))
+@cuda_lower("GroupType.std", GroupType(types.float64))
+def cuda_Group_std(context, builder, sig, args):
+    return lowering_function(context, builder, sig, args, "std")
+
+
+@cuda_lower("GroupType.var", GroupType(types.int64))
+@cuda_lower("GroupType.var", GroupType(types.float64))
+def cuda_Group_var(context, builder, sig, args):
+    return lowering_function(context, builder, sig, args, "var")
+
+
+@cuda_lower("GroupType.idxmax", GroupType(types.int64, types.int64))
+@cuda_lower("GroupType.idxmax", GroupType(types.float64, types.int64))
+def cuda_Group_idxmax(context, builder, sig, args):
+    retty = sig.return_type
+
+    grp = cgutils.create_struct_proxy(sig.args[0])(
+        context, builder, value=args[0]
+    )
+    grp_type = sig.args[0]
+
+    group_dataty = grp_type.group_data_type
+    group_data_ptr = builder.alloca(grp.group_data.type)
+    builder.store(grp.group_data, group_data_ptr)
+
+    index_dataty = grp_type.group_index_type
+    index_ptr = builder.alloca(grp.index.type)
+    builder.store(grp.index, index_ptr)
+
+    func = call_cuda_functions["idxmax"][grp_type.group_scalar_type]
+
+    return context.compile_internal(
+        builder,
+        func,
+        nb_signature(retty, group_dataty, index_dataty, grp_type.size_type),
+        (builder.load(group_data_ptr), builder.load(index_ptr), grp.size),
+    )
+
+
+@cuda_lower("GroupType.idxmin", GroupType(types.int64, types.int64))
+@cuda_lower("GroupType.idxmin", GroupType(types.float64, types.int64))
+def cuda_Group_idxmin(context, builder, sig, args):
+    retty = sig.return_type
+
+    grp = cgutils.create_struct_proxy(sig.args[0])(
+        context, builder, value=args[0]
+    )
+    grp_type = sig.args[0]
+
+    group_dataty = grp_type.group_data_type
+    group_data_ptr = builder.alloca(grp.group_data.type)
+    builder.store(grp.group_data, group_data_ptr)
+
+    index_dataty = grp_type.group_index_type
+    index_ptr = builder.alloca(grp.index.type)
+    builder.store(grp.index, index_ptr)
+
+    func = call_cuda_functions["idxmin"][grp_type.group_scalar_type]
+
+    return context.compile_internal(
+        builder,
+        func,
+        nb_signature(retty, group_dataty, index_dataty, grp_type.size_type),
+        (builder.load(group_data_ptr), builder.load(index_ptr), grp.size),
+    )

From 52656ab914e82255f2368ecef6a447236bbbe592 Mon Sep 17 00:00:00 2001
From: brandon-b-miller
Date: Mon, 28 Nov 2022 07:57:31 -0800
Subject: [PATCH 022/121] continue refactoring idxmin and idxmax functions

---
 python/cudf/cudf/core/udf/groupby_function.py | 69 ++++++-------------
 1 file changed, 20 insertions(+), 49 deletions(-)

diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py
index 45645a3b75e..6929f023540 100644
--- a/python/cudf/cudf/core/udf/groupby_function.py
+++ b/python/cudf/cudf/core/udf/groupby_function.py
@@ -122,6 +122,22 @@ def caller(data, size):
     call_cuda_functions[func.lower()][retty] = caller


+def _register_cuda_idxreduction_caller(func, inputty):
+    cuda_func = cuda.declare_device(
+        f"Block{func}_{inputty}",
+        types.int64(
+            types.CPointer(inputty), types.CPointer(types.int64), types.int64
+        ),
+    )
+
+    def caller(data, index, size):
+        return cuda_func(data, index, size)
+
+    if call_cuda_functions.get(func.lower()) is None:
+        call_cuda_functions[func.lower()] = {}
+    call_cuda_functions[func.lower()][types.int64] = caller
+
+
 _register_cuda_reduction_caller("Max", types.float64, types.float64)
 _register_cuda_reduction_caller("Max", types.int64, types.int64)
 _register_cuda_reduction_caller("Min", types.float64, types.float64)
 _register_cuda_reduction_caller("Min", types.int64, types.int64)
 _register_cuda_reduction_caller("Min", types.float64, types.float64)
 _register_cuda_reduction_caller("Sum", types.int64, types.int64)
 _register_cuda_reduction_caller("Sum", types.float64, types.float64)
 _register_cuda_reduction_caller("Mean", types.int64, types.float64)
 _register_cuda_reduction_caller("Mean", types.float64, types.float64)
 _register_cuda_reduction_caller("Std", types.int64, types.float64)
 _register_cuda_reduction_caller("Std", types.float64, types.float64)
 _register_cuda_reduction_caller("Var", types.int64, types.float64)
 _register_cuda_reduction_caller("Var", types.float64, types.float64)
-
-
-my_idxmax_int64 = cuda.declare_device(
-    "BlockIdxMax_int64",
-    "types.int64(types.CPointer(types.int64),"
-    "types.CPointer(types.int64),types.int64)",
-)
-
-my_idxmax_float64 = cuda.declare_device(
-    "BlockIdxMax_float64",
-    "types.int64(types.CPointer(types.float64),"
-    "types.CPointer(types.int64),types.int64)",
-)
-
-my_idxmin_int64 =
cuda.declare_device( - "BlockIdxMin_int64", - "types.int64(types.CPointer(types.int64)," - "types.CPointer(types.int64),types.int64)", -) - -my_idxmin_float64 = cuda.declare_device( - "BlockIdxMin_float64", - "types.int64(types.CPointer(types.float64)," - "types.CPointer(types.int64),types.int64)", -) - - -def call_idxmax_int64(data, index, size): - return my_idxmax_int64(data, index, size) - - -def call_idxmax_float64(data, index, size): - return my_idxmax_float64(data, index, size) - - -def call_idxmin_int64(data, index, size): - return my_idxmin_int64(data, index, size) - - -def call_idxmin_float64(data, index, size): - return my_idxmin_float64(data, index, size) - - -call_cuda_functions["idxmax"] = {} -call_cuda_functions["idxmin"] = {} -call_cuda_functions["idxmax"][types.int64] = call_idxmax_int64 -call_cuda_functions["idxmin"][types.int64] = call_idxmin_int64 -call_cuda_functions["idxmax"][types.float64] = call_idxmax_float64 -call_cuda_functions["idxmin"][types.float64] = call_idxmin_float64 +_register_cuda_idxreduction_caller("IdxMax", types.int64) +_register_cuda_idxreduction_caller("IdxMax", types.float64) +_register_cuda_idxreduction_caller("IdxMin", types.int64) +_register_cuda_idxreduction_caller("IdxMin", types.float64) make_attribute_wrapper(GroupType, "group_data", "group_data") From b9096f32644bed837caec4e36ff8919fc0a7e8c2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Nov 2022 09:13:13 -0800 Subject: [PATCH 023/121] add tests for idxmin and idxmax, not currently passing --- python/cudf/cudf/core/udf/groupby_function.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 26 ++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 6929f023540..8226c389221 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -135,7 +135,7 @@ def caller(data, index, size): if call_cuda_functions.get(func.lower()) is None: call_cuda_functions[func.lower()] = {} - call_cuda_functions[func.lower()][types.int64] = caller + call_cuda_functions[func.lower()][inputty] = caller _register_cuda_reduction_caller("Max", types.float64, types.float64) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 8d7aade0b8c..124251996da 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -369,7 +369,14 @@ def emulate(df): assert_groupby_results_equal(expect, got) -def test_groupby_apply_jit(): +@pytest.mark.parametrize( + "func", + [ + lambda df: df["val1"].max() + df["val2"].min(), + lambda df: df["val1"].idxmax() + df["val2"].idxmin(), + ], +) +def test_groupby_apply_jit(func): np.random.seed(0) df = DataFrame() nelem = 20 @@ -381,10 +388,7 @@ def test_groupby_apply_jit(): expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) got_grpby = df.groupby(["key1", "key2"]) - def foo(df): - return df["val1"].max() + df["val2"].min() - - expect = expect_grpby.apply(foo) + expect = expect_grpby.apply(func) # TODO: Due to some inconsistencies between how pandas and cudf handle the # created index we get different columns in the index vs the data and a # different name. 
For now I'm hacking around this to test the core @@ -392,11 +396,15 @@ def foo(df): names = list(expect.columns) names[2] = 0 expect.columns = names + + got_jit = got_grpby.apply(func, engine="jit").reset_index() # TODO: Shouldn't have to reset_index below - got_nonjit = got_grpby.apply(foo).reset_index() - got_jit = got_grpby.apply(foo, engine="jit").reset_index() - assert_groupby_results_equal(expect, got_nonjit) - assert_groupby_results_equal(expect, got_jit) + try: + got_nonjit = got_grpby.apply(func).reset_index() + assert_groupby_results_equal(expect, got_nonjit) + assert_groupby_results_equal(expect, got_jit) + except AttributeError: + assert_groupby_results_equal(expect, got_jit) def create_test_groupby_apply_jit_args_params(): From d21a099383cb4832c0844d5739c95b59eb235961 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Nov 2022 09:23:45 -0800 Subject: [PATCH 024/121] normalize call_cuda_functions keys --- python/cudf/cudf/core/udf/groupby_function.py | 8 ++++++-- python/cudf/cudf/core/udf/groupby_lowering.py | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 8226c389221..a29aa2b9d1f 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -119,7 +119,9 @@ def caller(data, size): if call_cuda_functions.get(func.lower()) is None: call_cuda_functions[func.lower()] = {} - call_cuda_functions[func.lower()][retty] = caller + + type_key = (retty, inputty) + call_cuda_functions[func.lower()][type_key] = caller def _register_cuda_idxreduction_caller(func, inputty): @@ -133,9 +135,11 @@ def _register_cuda_idxreduction_caller(func, inputty): def caller(data, index, size): return cuda_func(data, index, size) + # idxmax and idxmin always return int64 + type_key = (types.int64, inputty) if call_cuda_functions.get(func.lower()) is None: call_cuda_functions[func.lower()] = {} - call_cuda_functions[func.lower()][inputty] = caller + call_cuda_functions[func.lower()][type_key] = caller _register_cuda_reduction_caller("Max", types.float64, types.float64) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index ca2623cf706..bbf69fe36da 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -25,7 +25,8 @@ def lowering_function(context, builder, sig, args, function): group_data_ptr = builder.alloca(grp.group_data.type) builder.store(grp.group_data, group_data_ptr) - func = call_cuda_functions[function][grp_type.group_scalar_type] + type_key = (sig.return_type, grp_type.group_scalar_type) + func = call_cuda_functions[function][type_key] return context.compile_internal( builder, @@ -130,7 +131,8 @@ def cuda_Group_idxmax(context, builder, sig, args): index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) - func = call_cuda_functions["idxmax"][grp_type.group_scalar_type] + type_key = (types.int64, grp_type.group_scalar_type) + func = call_cuda_functions["idxmax"][type_key] return context.compile_internal( builder, From 62aad1e465541f79cfc3d561c873d483fb7ab506 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Nov 2022 11:58:15 -0800 Subject: [PATCH 025/121] continued refactoring --- python/cudf/cudf/core/groupby/groupby.py | 6 +- python/cudf/cudf/core/udf/groupby_function.py | 369 +++--------------- python/cudf/cudf/core/udf/groupby_utils.py | 199 ++++++++++ 
python/cudf/cudf/core/udf/utils.py | 11 - 4 files changed, 265 insertions(+), 320 deletions(-) create mode 100644 python/cudf/cudf/core/udf/groupby_utils.py diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e08e2211cf2..4ed54ba0447 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -23,7 +23,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex -from cudf.core.udf.groupby_function import jit_groupby_apply +from cudf.core.udf.groupby_utils import jit_groupby_apply from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate @@ -770,7 +770,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - def apply(self, function, *args, engine="cudf", cache=True): + def apply(self, function, *args, engine="cudf"): """Apply a python transformation function over the grouped chunk. Parameters @@ -841,7 +841,7 @@ def mult(df): if engine == "jit": chunk_results = jit_groupby_apply( - offsets, grouped_values, function, *args, cache=cache + offsets, grouped_values, function, *args ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index a29aa2b9d1f..094e282f9e7 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -1,12 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. - -import math -import os from typing import Any, Dict -import cupy as cp import numba -import numpy as np from numba import cuda, types from numba.core.extending import ( make_attribute_wrapper, @@ -18,32 +13,12 @@ from numba.core.typing import signature as nb_signature from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry -from numba.cuda.cudadrv.devices import get_context -from numba.np import numpy_support -from numba.types import Record - -from cudf.core.column import as_column -from cudf.core.udf.templates import ( - group_initializer_template, - groupby_apply_kernel_template, -) -from cudf.core.udf.utils import ( - _all_dtypes_from_frame, - _compile_or_get, - _get_kernel_groupby_apply, - _get_ptx_file, - _get_udf_return_type, - _supported_cols_from_frame, - _supported_dtypes_from_frame, -) -from cudf.utils.utils import _cudf_nvtx_annotate # Disable occupancy warnings to avoid polluting output when there are few # groups. 
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 index_default_type = types.int64 -dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") class Group(object): @@ -103,9 +78,6 @@ def __init__( models.StructModel.__init__(self, dmm, fe_type, members) -SUPPORTED_INPUT_TYPES = [types.int64, types.float64] - - call_cuda_functions: Dict[Any, Any] = {} @@ -142,84 +114,28 @@ def caller(data, index, size): call_cuda_functions[func.lower()][type_key] = caller -_register_cuda_reduction_caller("Max", types.float64, types.float64) -_register_cuda_reduction_caller("Max", types.int64, types.int64) -_register_cuda_reduction_caller("Min", types.float64, types.float64) -_register_cuda_reduction_caller("Min", types.int64, types.int64) -_register_cuda_reduction_caller("Min", types.float64, types.float64) -_register_cuda_reduction_caller("Sum", types.int64, types.int64) -_register_cuda_reduction_caller("Sum", types.float64, types.float64) -_register_cuda_reduction_caller("Mean", types.int64, types.float64) -_register_cuda_reduction_caller("Mean", types.float64, types.float64) -_register_cuda_reduction_caller("Std", types.int64, types.float64) -_register_cuda_reduction_caller("Std", types.float64, types.float64) -_register_cuda_reduction_caller("Var", types.int64, types.float64) -_register_cuda_reduction_caller("Var", types.float64, types.float64) -_register_cuda_idxreduction_caller("IdxMax", types.int64) -_register_cuda_idxreduction_caller("IdxMax", types.float64) -_register_cuda_idxreduction_caller("IdxMin", types.int64) -_register_cuda_idxreduction_caller("IdxMin", types.float64) - - -make_attribute_wrapper(GroupType, "group_data", "group_data") -make_attribute_wrapper(GroupType, "index", "index") -make_attribute_wrapper(GroupType, "size", "size") - - -class GroupMax(AbstractTemplate): - key = "GroupType.max" - - def generic(self, args, kws): - return nb_signature(self.this.group_scalar_type, recvr=self.this) - +def _create_reduction_attr(name, retty=None): + class Attr(AbstractTemplate): + key = name -class GroupMin(AbstractTemplate): - key = "GroupType.min" - - def generic(self, args, kws): - return nb_signature(self.this.group_scalar_type, recvr=self.this) + if retty: + def generic(self, args, kws): + return nb_signature(retty, recvr=self.this) -class GroupSize(AbstractTemplate): - key = "GroupType.size" - - def generic(self, args, kws): - return nb_signature(types.int64, recvr=self.this) - - -class GroupCount(AbstractTemplate): - key = "GroupType.count" - - def generic(self, args, kws): - return nb_signature(types.int64, recvr=self.this) - - -class GroupSum(AbstractTemplate): - key = "GroupType.sum" - - def generic(self, args, kws): - return nb_signature(self.this.group_scalar_type, recvr=self.this) - - -class GroupMean(AbstractTemplate): - key = "GroupType.mean" - - def generic(self, args, kws): - return nb_signature(types.float64, recvr=self.this) - + else: -class GroupStd(AbstractTemplate): - key = "GroupType.std" + def generic(self, args, kws): + return nb_signature(self.this.group_scalar_type, recvr=self.this) - def generic(self, args, kws): - return nb_signature(types.float64, recvr=self.this) + Attr.generic = generic + def _attr(self, mod): + return types.BoundFunction( + Attr, GroupType(mod.group_scalar_type, mod.index_type) + ) -class GroupVar(AbstractTemplate): - key = "GroupType.var" - - def generic(self, args, kws): - return nb_signature(types.float64, recvr=self.this) + return _attr class GroupIdxMax(AbstractTemplate): @@ -240,46 +156,6 @@ def generic(self, args, kws): class 
GroupAttr(AttributeTemplate): key = GroupType - def resolve_max(self, mod): - return types.BoundFunction( - GroupMax, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_min(self, mod): - return types.BoundFunction( - GroupMin, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_size(self, mod): - return types.BoundFunction( - GroupSize, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_count(self, mod): - return types.BoundFunction( - GroupCount, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_sum(self, mod): - return types.BoundFunction( - GroupSum, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_mean(self, mod): - return types.BoundFunction( - GroupMean, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_std(self, mod): - return types.BoundFunction( - GroupStd, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_var(self, mod): - return types.BoundFunction( - GroupVar, GroupType(mod.group_scalar_type, mod.index_type) - ) - def resolve_idxmax(self, mod): return types.BoundFunction( GroupIdxMax, GroupType(mod.group_scalar_type, mod.index_type) @@ -291,174 +167,55 @@ def resolve_idxmin(self, mod): ) -def _get_frame_groupby_type(dtype, index_dtype): - """ - Get the numba `Record` type corresponding to a frame. - Models the column as a dictionary like data structure - containing GroupTypes. - Large parts of this function are copied with comments - from the Numba internals and slightly modified to - account for validity bools to be present in the final - struct. - See numba.np.numpy_support.from_struct_dtype for details. - """ - - # Create the numpy structured type corresponding to the numpy dtype. - - fields = [] - offset = 0 - - sizes = [val[0].itemsize for val in dtype.fields.values()] - for i, (name, info) in enumerate(dtype.fields.items()): - # *info* consists of the element dtype, its offset from the beginning - # of the record, and an optional "title" containing metadata. - # We ignore the offset in info because its value assumes no masking; - # instead, we compute the correct offset based on the masked type. - elemdtype = info[0] - title = info[2] if len(info) == 3 else None - ty = numpy_support.from_dtype(elemdtype) - indexty = numpy_support.from_dtype(index_dtype) - infos = { - "type": GroupType(ty, indexty), - "offset": offset, - "title": title, - } - fields.append((name, infos)) - - # increment offset by itemsize plus one byte for validity - offset += 8 + 8 + 8 # group struct size (2 pointers and 1 integer) - - # Align the next member of the struct to be a multiple of the - # memory access size, per PTX ISA 7.4/5.4.5 - if i < len(sizes) - 1: - # next_itemsize = sizes[i + 1] - next_itemsize = 8 - offset = int(math.ceil(offset / next_itemsize) * next_itemsize) - - # Numba requires that structures are aligned for the CUDA target - _is_aligned_struct = True - return Record(fields, offset, _is_aligned_struct) - - -def _groupby_apply_kernel_string_from_template(frame, args): - """ - Function to write numba kernels for `DataFrame.apply` as a string. - Workaround until numba supports functions that use `*args` - - Both the number of input columns as well as their nullability and any - scalar arguments may vary, so the kernels vary significantly. See - templates.py for the full row kernel template and more details. 
- """ - # Create argument list for kernel - frame = _supported_cols_from_frame(frame) - - input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - # Generate the initializers for each device function argument - initializers = [] - for i, (colname, col) in enumerate(frame.items()): - idx = str(i) - initializers.append( - group_initializer_template.format(idx=idx, name=colname) - ) - - return groupby_apply_kernel_template.format( - input_columns=input_columns, - extra_args=extra_args, - group_initializers="\n".join(initializers), - ) - - -def _get_groupby_apply_kernel(frame, func, args): - dataframe_group_type = _get_frame_groupby_type( - np.dtype(list(_all_dtypes_from_frame(frame).items())), - frame.index.dtype, - ) - - return_type = _get_udf_return_type(dataframe_group_type, func, args) - - np_field_types = np.dtype( - list(_supported_dtypes_from_frame(frame).items()) - ) - dataframe_group_type = _get_frame_groupby_type( - np_field_types, frame.index.dtype - ) - - # Dict of 'local' variables into which `_kernel` is defined - global_exec_context = { - "cuda": cuda, - "Group": Group, - "dataframe_group_type": dataframe_group_type, - "types": types, - } - kernel_string = _groupby_apply_kernel_string_from_template(frame, args) - - kernel = _get_kernel_groupby_apply( - kernel_string, global_exec_context, func, dev_func_ptx - ) - - return kernel, return_type - - -@_cudf_nvtx_annotate -def jit_groupby_apply(offsets, grouped_values, function, *args, cache=True): - ngroups = len(offsets) - 1 - - if cache is True: - kernel, return_type = _compile_or_get( - grouped_values, function, args, _get_groupby_apply_kernel - ) - else: - kernel, return_type = _get_groupby_apply_kernel( - grouped_values, function, args - ) - return_type = numpy_support.as_dtype(return_type) - - output = cp.empty(ngroups, dtype=return_type) - - launch_args = [ - cp.asarray(offsets), - output, - cp.asarray(grouped_values.index), - ] - - for col in _supported_cols_from_frame(grouped_values).values(): - launch_args.append(cp.asarray(col)) - - launch_args += list(args) - - max_group_size = cp.diff(offsets).max() - - if max_group_size >= 1000: - # if ngroups < 100: - # blocklim = 1024 - # else: - blocklim = 256 - else: - blocklim = ((max_group_size + 32 - 1) / 32) * 32 - - if kernel.specialized: - specialized = kernel - else: - specialized = kernel.specialize(*launch_args) - - # Ask the driver to give a good config - ctx = get_context() - # Dispatcher is specialized, so there's only one definition - get - # it so we can get the cufunc from the code library - kern_def = next(iter(specialized.overloads.values())) - grid, tpb = ctx.get_max_potential_block_size( - func=kern_def._codelibrary.get_cufunc(), - b2d_func=0, - memsize=0, - blocksizelimit=blocklim, - ) +setattr(GroupAttr, "resolve_max", _create_reduction_attr("GroupType.max")) +setattr(GroupAttr, "resolve_min", _create_reduction_attr("GroupType.min")) +setattr(GroupAttr, "resolve_sum", _create_reduction_attr("GroupType.sum")) +setattr( + GroupAttr, + "resolve_size", + _create_reduction_attr("GroupType.size", retty=types.int64), +) +setattr( + GroupAttr, + "resolve_count", + _create_reduction_attr("GroupType.count", retty=types.int64), +) +setattr( + GroupAttr, + "resolve_mean", + _create_reduction_attr("GroupType.mean", retty=types.float64), +) +setattr( + GroupAttr, + "resolve_var", + _create_reduction_attr("GroupType.var", retty=types.float64), +) +setattr( + GroupAttr, + "resolve_std", 
+ _create_reduction_attr("GroupType.std", retty=types.float64), +) - stream = cuda.default_stream() - specialized[ngroups, tpb, stream](*launch_args) +_register_cuda_reduction_caller("Max", types.float64, types.float64) +_register_cuda_reduction_caller("Max", types.int64, types.int64) +_register_cuda_reduction_caller("Min", types.float64, types.float64) +_register_cuda_reduction_caller("Min", types.int64, types.int64) +_register_cuda_reduction_caller("Min", types.float64, types.float64) +_register_cuda_reduction_caller("Sum", types.int64, types.int64) +_register_cuda_reduction_caller("Sum", types.float64, types.float64) +_register_cuda_reduction_caller("Mean", types.int64, types.float64) +_register_cuda_reduction_caller("Mean", types.float64, types.float64) +_register_cuda_reduction_caller("Std", types.int64, types.float64) +_register_cuda_reduction_caller("Std", types.float64, types.float64) +_register_cuda_reduction_caller("Var", types.int64, types.float64) +_register_cuda_reduction_caller("Var", types.float64, types.float64) +_register_cuda_idxreduction_caller("IdxMax", types.int64) +_register_cuda_idxreduction_caller("IdxMax", types.float64) +_register_cuda_idxreduction_caller("IdxMin", types.int64) +_register_cuda_idxreduction_caller("IdxMin", types.float64) - stream.synchronize() - return as_column(output, dtype=output.dtype) +make_attribute_wrapper(GroupType, "group_data", "group_data") +make_attribute_wrapper(GroupType, "index", "index") +make_attribute_wrapper(GroupType, "size", "size") diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py new file mode 100644 index 00000000000..2b4445f5090 --- /dev/null +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -0,0 +1,199 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import math +import os + +import cupy as cp +import numpy as np +from numba import cuda, types +from numba.cuda.cudadrv.devices import get_context +from numba.np import numpy_support +from numba.types import Record + +from cudf.core.column import as_column +from cudf.core.udf.groupby_function import Group, GroupType +from cudf.core.udf.templates import ( + group_initializer_template, + groupby_apply_kernel_template, +) +from cudf.core.udf.utils import ( + _all_dtypes_from_frame, + _get_ptx_file, + _get_udf_return_type, + _supported_cols_from_frame, + _supported_dtypes_from_frame, +) +from cudf.utils.utils import _cudf_nvtx_annotate + +dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") + + +def _get_kernel_groupby_apply(kernel_string, globals_, func, dev_func_ptx): + """Template kernel compilation helper function for groupby apply""" + f_ = cuda.jit(device=True)(func) + globals_["f_"] = f_ + exec(kernel_string, globals_) + _kernel = globals_["_kernel"] + kernel = cuda.jit(link=[dev_func_ptx])(_kernel) + + return kernel + + +def _get_frame_groupby_type(dtype, index_dtype): + """ + Get the numba `Record` type corresponding to a frame. + Models the column as a dictionary like data structure + containing GroupTypes. + Large parts of this function are copied with comments + from the Numba internals and slightly modified to + account for validity bools to be present in the final + struct. + See numba.np.numpy_support.from_struct_dtype for details. + """ + + # Create the numpy structured type corresponding to the numpy dtype. 
+ + fields = [] + offset = 0 + + sizes = [val[0].itemsize for val in dtype.fields.values()] + for i, (name, info) in enumerate(dtype.fields.items()): + elemdtype = info[0] + title = info[2] if len(info) == 3 else None + ty = numpy_support.from_dtype(elemdtype) + indexty = numpy_support.from_dtype(index_dtype) + infos = { + "type": GroupType(ty, indexty), + "offset": offset, + "title": title, + } + fields.append((name, infos)) + + offset += 8 + 8 + 8 # group struct size (2 pointers and 1 integer) + + # Align the next member of the struct to be a multiple of the + # memory access size, per PTX ISA 7.4/5.4.5 + if i < len(sizes) - 1: + # next_itemsize = sizes[i + 1] + next_itemsize = 8 + offset = int(math.ceil(offset / next_itemsize) * next_itemsize) + + # Numba requires that structures are aligned for the CUDA target + _is_aligned_struct = True + return Record(fields, offset, _is_aligned_struct) + + +def _groupby_apply_kernel_string_from_template(frame, args): + """ + Function to write numba kernels for `DataFrame.apply` as a string. + Workaround until numba supports functions that use `*args` + + Both the number of input columns as well as their nullability and any + scalar arguments may vary, so the kernels vary significantly. See + templates.py for the full row kernel template and more details. + """ + # Create argument list for kernel + frame = _supported_cols_from_frame(frame) + + input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + # Generate the initializers for each device function argument + initializers = [] + for i, (colname, col) in enumerate(frame.items()): + idx = str(i) + initializers.append( + group_initializer_template.format(idx=idx, name=colname) + ) + + return groupby_apply_kernel_template.format( + input_columns=input_columns, + extra_args=extra_args, + group_initializers="\n".join(initializers), + ) + + +def _get_groupby_apply_kernel(frame, func, args): + dataframe_group_type = _get_frame_groupby_type( + np.dtype(list(_all_dtypes_from_frame(frame).items())), + frame.index.dtype, + ) + + return_type = _get_udf_return_type(dataframe_group_type, func, args) + + np_field_types = np.dtype( + list(_supported_dtypes_from_frame(frame).items()) + ) + dataframe_group_type = _get_frame_groupby_type( + np_field_types, frame.index.dtype + ) + + # Dict of 'local' variables into which `_kernel` is defined + global_exec_context = { + "cuda": cuda, + "Group": Group, + "dataframe_group_type": dataframe_group_type, + "types": types, + } + kernel_string = _groupby_apply_kernel_string_from_template(frame, args) + + kernel = _get_kernel_groupby_apply( + kernel_string, global_exec_context, func, dev_func_ptx + ) + + return kernel, return_type + + +@_cudf_nvtx_annotate +def jit_groupby_apply(offsets, grouped_values, function, *args): + ngroups = len(offsets) - 1 + + kernel, return_type = _get_groupby_apply_kernel( + grouped_values, function, args + ) + return_type = numpy_support.as_dtype(return_type) + + output = cp.empty(ngroups, dtype=return_type) + + launch_args = [ + cp.asarray(offsets), + output, + cp.asarray(grouped_values.index), + ] + + for col in _supported_cols_from_frame(grouped_values).values(): + launch_args.append(cp.asarray(col)) + + launch_args += list(args) + + max_group_size = cp.diff(offsets).max() + + if max_group_size >= 1000: + blocklim = 256 + else: + blocklim = ((max_group_size + 32 - 1) / 32) * 32 + + if kernel.specialized: + specialized = kernel + else: + specialized = 
kernel.specialize(*launch_args) + + # Ask the driver to give a good config + ctx = get_context() + # Dispatcher is specialized, so there's only one definition - get + # it so we can get the cufunc from the code library + kern_def = next(iter(specialized.overloads.values())) + grid, tpb = ctx.get_max_potential_block_size( + func=kern_def._codelibrary.get_cufunc(), + b2d_func=0, + memsize=0, + blocksizelimit=blocklim, + ) + + stream = cuda.default_stream() + + specialized[ngroups, tpb, stream](*launch_args) + + stream.synchronize() + + return as_column(output, dtype=output.dtype) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 0a9833fe5b8..fbc5ddfe24b 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -244,17 +244,6 @@ def _get_kernel(kernel_string, globals_, sig, func): return kernel -def _get_kernel_groupby_apply(kernel_string, globals_, func, dev_func_ptx): - """Template kernel compilation helper function for groupby apply""" - f_ = cuda.jit(device=True)(func) - globals_["f_"] = f_ - exec(kernel_string, globals_) - _kernel = globals_["_kernel"] - kernel = cuda.jit(link=[dev_func_ptx])(_kernel) - - return kernel - - def _get_input_args_from_frame(fr): args = [] offsets = [] From 9ff058a63ded6d20589a680c885d303101999df8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 28 Nov 2022 12:17:11 -0800 Subject: [PATCH 026/121] refactoring lowering --- python/cudf/cudf/core/udf/groupby_function.py | 1 + python/cudf/cudf/core/udf/groupby_lowering.py | 109 +++++++----------- 2 files changed, 45 insertions(+), 65 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 094e282f9e7..56ff0ff9365 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -19,6 +19,7 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 index_default_type = types.int64 +SUPPORTED_GROUPBY_JIT_TYPES = [types.int64, types.float64] class Group(object): diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index bbf69fe36da..7f395b12874 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -7,6 +7,7 @@ from numba.cuda.cudaimpl import lower as cuda_lower from cudf.core.udf.groupby_function import ( + SUPPORTED_GROUPBY_JIT_TYPES, Group, GroupType, call_cuda_functions, @@ -59,20 +60,40 @@ def group_constructor(context, builder, sig, args): return grp._getvalue() -@cuda_lower("GroupType.max", GroupType(types.int64)) -@cuda_lower("GroupType.max", GroupType(types.float64)) +def cuda_Group_idx_max_or_min(context, builder, sig, args, fname): + retty = sig.return_type + + grp = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + grp_type = sig.args[0] + + group_dataty = grp_type.group_data_type + group_data_ptr = builder.alloca(grp.group_data.type) + builder.store(grp.group_data, group_data_ptr) + + index_dataty = grp_type.group_index_type + index_ptr = builder.alloca(grp.index.type) + builder.store(grp.index, index_ptr) + type_key = (types.int64, grp_type.group_scalar_type) + func = call_cuda_functions[fname][type_key] + + return context.compile_internal( + builder, + func, + nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), + (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), + ) + + def cuda_Group_max(context, builder, sig, args): return 
lowering_function(context, builder, sig, args, "max") -@cuda_lower("GroupType.min", GroupType(types.int64)) -@cuda_lower("GroupType.min", GroupType(types.float64)) def cuda_Group_min(context, builder, sig, args): return lowering_function(context, builder, sig, args, "min") -@cuda_lower("GroupType.size", GroupType(types.int64)) -@cuda_lower("GroupType.size", GroupType(types.float64)) def cuda_Group_size(context, builder, sig, args): grp = cgutils.create_struct_proxy(sig.args[0])( context, builder, value=args[0] @@ -80,8 +101,6 @@ def cuda_Group_size(context, builder, sig, args): return grp.size -@cuda_lower("GroupType.count", GroupType(types.int64)) -@cuda_lower("GroupType.count", GroupType(types.float64)) def cuda_Group_count(context, builder, sig, args): grp = cgutils.create_struct_proxy(sig.args[0])( context, builder, value=args[0] @@ -89,82 +108,42 @@ def cuda_Group_count(context, builder, sig, args): return grp.size -@cuda_lower("GroupType.sum", GroupType(types.int64)) -@cuda_lower("GroupType.sum", GroupType(types.float64)) def cuda_Group_sum(context, builder, sig, args): return lowering_function(context, builder, sig, args, "sum") -@cuda_lower("GroupType.mean", GroupType(types.int64)) -@cuda_lower("GroupType.mean", GroupType(types.float64)) def cuda_Group_mean(context, builder, sig, args): return lowering_function(context, builder, sig, args, "mean") -@cuda_lower("GroupType.std", GroupType(types.int64)) -@cuda_lower("GroupType.std", GroupType(types.float64)) def cuda_Group_std(context, builder, sig, args): return lowering_function(context, builder, sig, args, "std") -@cuda_lower("GroupType.var", GroupType(types.int64)) -@cuda_lower("GroupType.var", GroupType(types.float64)) def cuda_Group_var(context, builder, sig, args): return lowering_function(context, builder, sig, args, "var") -@cuda_lower("GroupType.idxmax", GroupType(types.int64, types.int64)) -@cuda_lower("GroupType.idxmax", GroupType(types.float64, types.int64)) def cuda_Group_idxmax(context, builder, sig, args): - retty = sig.return_type + return cuda_Group_idx_max_or_min(context, builder, sig, args, "idxmax") - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - index_dataty = grp_type.group_index_type - index_ptr = builder.alloca(grp.index.type) - builder.store(grp.index, index_ptr) - - type_key = (types.int64, grp_type.group_scalar_type) - func = call_cuda_functions["idxmax"][type_key] - - return context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), - (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), - ) - - -@cuda_lower("GroupType.idxmin", GroupType(types.int64, types.int64)) -@cuda_lower("GroupType.idxmin", GroupType(types.float64, types.int64)) def cuda_Group_idxmin(context, builder, sig, args): - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] + return cuda_Group_idx_max_or_min(context, builder, sig, args, "idxmin") + + +for ty in SUPPORTED_GROUPBY_JIT_TYPES: + cuda_lower("GroupType.max", GroupType(ty))(cuda_Group_max) + cuda_lower("GroupType.min", GroupType(ty))(cuda_Group_min) + cuda_lower("GroupType.sum", GroupType(ty))(cuda_Group_sum) + cuda_lower("GroupType.count", GroupType(ty))(cuda_Group_count) + cuda_lower("GroupType.size", GroupType(ty))(cuda_Group_size) + 
cuda_lower("GroupType.mean", GroupType(ty))(cuda_Group_mean) + cuda_lower("GroupType.std", GroupType(ty))(cuda_Group_std) + cuda_lower("GroupType.var", GroupType(ty))(cuda_Group_var) + cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))( + cuda_Group_idxmax ) - grp_type = sig.args[0] - - group_dataty = grp_type.group_data_type - group_data_ptr = builder.alloca(grp.group_data.type) - builder.store(grp.group_data, group_data_ptr) - - index_dataty = grp_type.group_index_type - index_ptr = builder.alloca(grp.index.type) - builder.store(grp.index, index_ptr) - - func = call_cuda_functions["idxmin"][grp_type.group_scalar_type] - - return context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), - (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), + cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))( + cuda_Group_idxmin ) From 5f07ca22c67c563ccd1ed0d2139c2622014f668f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 29 Nov 2022 11:47:12 -0800 Subject: [PATCH 027/121] continued refactoring --- python/cudf/cudf/core/udf/groupby_function.py | 6 +- python/cudf/cudf/core/udf/groupby_lowering.py | 4 +- python/cudf/cudf/core/udf/groupby_utils.py | 74 ++++++++----------- python/cudf/cudf/core/udf/utils.py | 16 ++-- 4 files changed, 45 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index 56ff0ff9365..ede18f1a4a5 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -13,13 +13,17 @@ from numba.core.typing import signature as nb_signature from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry +from numba.np import numpy_support # Disable occupancy warnings to avoid polluting output when there are few # groups. 
numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 index_default_type = types.int64 -SUPPORTED_GROUPBY_JIT_TYPES = [types.int64, types.float64] +SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] +SUPPORTED_GROUPBY_NUMPY_TYPES = [ + numpy_support.as_dtype(dt) for dt in SUPPORTED_GROUPBY_NUMBA_TYPES +] class Group(object): diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 7f395b12874..5fe92b464cc 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -7,7 +7,7 @@ from numba.cuda.cudaimpl import lower as cuda_lower from cudf.core.udf.groupby_function import ( - SUPPORTED_GROUPBY_JIT_TYPES, + SUPPORTED_GROUPBY_NUMBA_TYPES, Group, GroupType, call_cuda_functions, @@ -132,7 +132,7 @@ def cuda_Group_idxmin(context, builder, sig, args): return cuda_Group_idx_max_or_min(context, builder, sig, args, "idxmin") -for ty in SUPPORTED_GROUPBY_JIT_TYPES: +for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: cuda_lower("GroupType.max", GroupType(ty))(cuda_Group_max) cuda_lower("GroupType.min", GroupType(ty))(cuda_Group_min) cuda_lower("GroupType.sum", GroupType(ty))(cuda_Group_sum) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 2b4445f5090..a19f85e5c46 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -10,14 +10,18 @@ from numba.np import numpy_support from numba.types import Record -from cudf.core.column import as_column -from cudf.core.udf.groupby_function import Group, GroupType +import cudf.core.udf.utils +from cudf.core.udf.groupby_function import ( + SUPPORTED_GROUPBY_NUMPY_TYPES, + Group, + GroupType, +) from cudf.core.udf.templates import ( group_initializer_template, groupby_apply_kernel_template, ) from cudf.core.udf.utils import ( - _all_dtypes_from_frame, + _get_kernel, _get_ptx_file, _get_udf_return_type, _supported_cols_from_frame, @@ -26,17 +30,7 @@ from cudf.utils.utils import _cudf_nvtx_annotate dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") - - -def _get_kernel_groupby_apply(kernel_string, globals_, func, dev_func_ptx): - """Template kernel compilation helper function for groupby apply""" - f_ = cuda.jit(device=True)(func) - globals_["f_"] = f_ - exec(kernel_string, globals_) - _kernel = globals_["_kernel"] - kernel = cuda.jit(link=[dev_func_ptx])(_kernel) - - return kernel +cudf.core.udf.utils.ptx_files.append(dev_func_ptx) def _get_frame_groupby_type(dtype, index_dtype): @@ -44,15 +38,10 @@ def _get_frame_groupby_type(dtype, index_dtype): Get the numba `Record` type corresponding to a frame. Models the column as a dictionary like data structure containing GroupTypes. - Large parts of this function are copied with comments - from the Numba internals and slightly modified to - account for validity bools to be present in the final - struct. See numba.np.numpy_support.from_struct_dtype for details. """ # Create the numpy structured type corresponding to the numpy dtype. - fields = [] offset = 0 @@ -93,17 +82,17 @@ def _groupby_apply_kernel_string_from_template(frame, args): templates.py for the full row kernel template and more details. 
""" # Create argument list for kernel - frame = _supported_cols_from_frame(frame) - + frame = _supported_cols_from_frame( + frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES + ) input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) # Generate the initializers for each device function argument initializers = [] - for i, (colname, col) in enumerate(frame.items()): - idx = str(i) + for i, colname in enumerate(frame.keys()): initializers.append( - group_initializer_template.format(idx=idx, name=colname) + group_initializer_template.format(idx=i, name=colname) ) return groupby_apply_kernel_template.format( @@ -114,19 +103,17 @@ def _groupby_apply_kernel_string_from_template(frame, args): def _get_groupby_apply_kernel(frame, func, args): - dataframe_group_type = _get_frame_groupby_type( - np.dtype(list(_all_dtypes_from_frame(frame).items())), - frame.index.dtype, - ) - - return_type = _get_udf_return_type(dataframe_group_type, func, args) - np_field_types = np.dtype( - list(_supported_dtypes_from_frame(frame).items()) + list( + _supported_dtypes_from_frame( + frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES + ).items() + ) ) dataframe_group_type = _get_frame_groupby_type( np_field_types, frame.index.dtype ) + return_type = _get_udf_return_type(dataframe_group_type, func, args) # Dict of 'local' variables into which `_kernel` is defined global_exec_context = { @@ -137,15 +124,14 @@ def _get_groupby_apply_kernel(frame, func, args): } kernel_string = _groupby_apply_kernel_string_from_template(frame, args) - kernel = _get_kernel_groupby_apply( - kernel_string, global_exec_context, func, dev_func_ptx - ) + kernel = _get_kernel(kernel_string, global_exec_context, None, func) return kernel, return_type @_cudf_nvtx_annotate def jit_groupby_apply(offsets, grouped_values, function, *args): + offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 kernel, return_type = _get_groupby_apply_kernel( @@ -153,17 +139,17 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): ) return_type = numpy_support.as_dtype(return_type) - output = cp.empty(ngroups, dtype=return_type) - + output = cudf.core.column.column_empty(ngroups, dtype=return_type) launch_args = [ - cp.asarray(offsets), + offsets, output, - cp.asarray(grouped_values.index), + grouped_values.index, ] - - for col in _supported_cols_from_frame(grouped_values).values(): - launch_args.append(cp.asarray(col)) - + launch_args += list( + _supported_cols_from_frame( + grouped_values, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES + ).values() + ) launch_args += list(args) max_group_size = cp.diff(offsets).max() @@ -196,4 +182,4 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): stream.synchronize() - return as_column(output, dtype=output.dtype) + return output diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index fbc5ddfe24b..187cc0db571 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -89,35 +89,35 @@ def _get_udf_return_type(argty, func: Callable, args=()): return result -def _is_jit_supported_type(dtype): +def _is_jit_supported_type(dtype, supported_types): # category dtype isn't hashable if isinstance(dtype, CategoricalDtype): return False - return str(dtype) in JIT_SUPPORTED_TYPES + return str(dtype) in supported_types -def _all_dtypes_from_frame(frame): +def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col.dtype - 
if _is_jit_supported_type(col.dtype) + if _is_jit_supported_type(col.dtype, supported_types=supported_types) else np.dtype("O") for colname, col in frame._data.items() } -def _supported_dtypes_from_frame(frame): +def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col.dtype for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) + if _is_jit_supported_type(col.dtype, supported_types=supported_types) } -def _supported_cols_from_frame(frame): +def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) + if _is_jit_supported_type(col.dtype, supported_types=supported_types) } From c12a9e3c295c26bb31a5d3b0df19df137a37e00f Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Mon, 2 Jan 2023 03:15:15 +0000 Subject: [PATCH 028/121] CMake changes --- python/cudf/udf_cpp/groupby/CMakeLists.txt | 11 +++++------ python/cudf/udf_cpp/groupby/function.cu | 14 +++++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt index 0a35c8ee0b4..d3116b2d88e 100644 --- a/python/cudf/udf_cpp/groupby/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -12,13 +12,13 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1) +cmake_minimum_required(VERSION 3.23.1) include(rapids-cmake) include(rapids-cuda) include(rapids-find) -rapids_cuda_init_architectures(GROUPBY_UDF) +rapids_cuda_init_architectures(groupby-udf-cpp) # Create a project so that we can enable CUDA architectures in this file. project( @@ -53,7 +53,7 @@ file(COPY \${ptx_paths} DESTINATION \"${destination}\")" endfunction() # Create the shim library for each architecture. -set(GROUPBY_FUNCTION_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) +set(GROUPBY_FUNCTION_CUDA_FLAGS --expt-relaxed-constexpr) # always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be # safely queried through a context @@ -72,13 +72,12 @@ foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) ${tgt} PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON CUDA_ARCHITECTURES ${arch} CUDA_PTX_COMPILATION ON + CUDA_SEPARABLE_COMPILATION ON ) - target_include_directories(${tgt} PUBLIC include) + target_include_directories(${tgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include) target_compile_options( ${tgt} PRIVATE "$<$:${GROUPBY_FUNCTION_CUDA_FLAGS}>" ) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 1b308374675..54032b595ad 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,9 +19,9 @@ using size_type = int; -/* + // double atomicAdd -__device__ __forceinline__ double atomicAdd(double* address, double val) +__device__ __forceinline__ double atomicAdds(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; @@ -35,10 +35,10 @@ __device__ __forceinline__ double atomicAdd(double* address, double val) return __longlong_as_double(old); } -*/ + // int64_t atomicAdd -__device__ __forceinline__ int64_t atomicAdd(int64_t* address, int64_t val) +__device__ __forceinline__ int64_t atomicAdds(int64_t* address, int64_t val) { return atomicAdd((unsigned long long*)address, (unsigned long long)val); } @@ -98,7 +98,7 @@ __device__ void device_sum(T const* data, int const items_per_thread, size_type } } - atomicAdd(sum, local_sum); + atomicAdds(sum, local_sum); __syncthreads(); } @@ -132,7 +132,7 @@ __device__ void device_var( } } - atomicAdd(var, local_var); + atomicAdds(var, local_var); __syncthreads(); From e650c21d0531c1be0d8a2eb8c7592b10ea6e32e1 Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Mon, 2 Jan 2023 14:42:53 +0000 Subject: [PATCH 029/121] C++ changes --- python/cudf/udf_cpp/groupby/function.cu | 89 +++++++++---------------- 1 file changed, 30 insertions(+), 59 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 54032b595ad..16899f749f2 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -85,15 +85,13 @@ __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) template __device__ void device_sum(T const* data, int const items_per_thread, size_type size, T* sum) { - int tid = threadIdx.x; - int tb_size = blockDim.x; T local_sum = 0; // Calculate local sum for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; + if (threadIdx.x + (item * blockDim.x) < size) { + T load = data[threadIdx.x + item * blockDim.x]; local_sum += load; } } @@ -108,8 +106,6 @@ template __device__ void device_var( T const* data, int const items_per_thread, size_type size, T* sum, double* var) { - int tid = threadIdx.x; - int tb_size = blockDim.x; // Calculate how many elements each thread is working on T local_sum = 0; double local_var = 0; @@ -124,8 +120,8 @@ __device__ void device_var( // Calculate local sum for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; + if (threadIdx.x + (item * blockDim.x) < size) { + T load = data[threadIdx.x + item * blockDim.x]; double temp = load - mean; temp = pow(temp, 2); local_var += temp; @@ -146,16 +142,13 @@ template __device__ void device_max( T const* data, int const items_per_thread, size_type size, T init_val, T* smax) { - int tid = threadIdx.x; - int tb_size = blockDim.x; - T local_max = init_val; // Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; + if (threadIdx.x + (item * blockDim.x) < size) { + T load = data[threadIdx.x + item * blockDim.x]; local_max = max(local_max, load); } } @@ -173,16 +166,14 @@ template __device__ void device_min( T const* data, int const items_per_thread, size_type size, T init_val, T* smin) { - int tid = 
threadIdx.x; - int tb_size = blockDim.x; T local_min = init_val; // Calculate local min for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; + if (threadIdx.x + (item * blockDim.x) < size) { + T load = data[threadIdx.x + item * blockDim.x]; local_min = min(local_min, load); } } @@ -205,8 +196,6 @@ __device__ void device_idxmax(T const* data, T* smax, int64_t* sidx) { - int tid = threadIdx.x; - int tb_size = blockDim.x; // Calculate how many elements each thread is working on T local_max = init_val; @@ -215,11 +204,11 @@ __device__ void device_idxmax(T const* data, // Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; + if (threadIdx.x + (item * blockDim.x) < size) { + T load = data[threadIdx.x + item * blockDim.x]; if (load > local_max) { local_max = load; - local_idx = index[tid + item * tb_size]; + local_idx = index[threadIdx.x + item * blockDim.x]; } } } @@ -246,8 +235,6 @@ __device__ void device_idxmin(T const* data, T* smin, int64_t* sidx) { - int tid = threadIdx.x; - int tb_size = blockDim.x; T local_min = init_val; int64_t local_idx = -1; @@ -255,11 +242,11 @@ __device__ void device_idxmin(T const* data, // Calculate local max for each thread #pragma unroll for (size_type item = 0; item < items_per_thread; item++) { - if (tid + (item * tb_size) < size) { - T load = data[tid + item * tb_size]; + if (threadIdx.x + (item * blockDim.x) < size) { + T load = data[threadIdx.x + item * blockDim.x]; if (load < local_min) { local_min = load; - local_idx = index[tid + item * tb_size]; + local_idx = index[threadIdx.x + item * blockDim.x]; } } } @@ -280,9 +267,8 @@ extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t sum; if (threadIdx.x == 0) { sum = 0; } @@ -300,9 +286,8 @@ extern "C" __device__ int BlockSum_float64(double* numba_return_value, double const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double sum; if (threadIdx.x == 0) { sum = 0; } @@ -320,9 +305,8 @@ extern "C" __device__ int BlockMean_int64(double* numba_return_value, int64_t const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t sum; if (threadIdx.x == 0) { sum = 0; } @@ -342,9 +326,8 @@ extern "C" __device__ int BlockMean_float64(double* numba_return_value, double const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double sum; if (threadIdx.x == 0) { sum = 0; } @@ -364,9 +347,8 @@ extern "C" __device__ int BlockStd_int64(double* numba_return_value, int64_t const* data, 
int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t sum; __shared__ double var; @@ -389,9 +371,8 @@ extern "C" __device__ int BlockStd_float64(double* numba_return_value, double const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double sum; __shared__ double var; @@ -414,9 +395,8 @@ extern "C" __device__ int BlockVar_int64(double* numba_return_value, int64_t const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t sum; __shared__ double var; @@ -439,9 +419,8 @@ extern "C" __device__ int BlockVar_float64(double* numba_return_value, double const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double sum; __shared__ double var; @@ -465,9 +444,8 @@ extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t smax; @@ -487,9 +465,8 @@ extern "C" __device__ int BlockMax_float64(double* numba_return_value, double const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double smax; @@ -509,9 +486,8 @@ extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t smin; @@ -531,9 +507,8 @@ extern "C" __device__ int BlockMin_float64(double* numba_return_value, double const* data, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double smin; @@ -554,9 +529,8 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t smax; __shared__ int64_t sidx; @@ -581,9 +555,8 @@ extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto 
const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double smax; __shared__ int64_t sidx; @@ -608,9 +581,8 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ int64_t smin; __shared__ int64_t sidx; @@ -635,9 +607,8 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, int64_t* index, int64_t size) { - int tb_size = blockDim.x; // Calculate how many elements each thread is working on - auto const items_per_thread = (size + tb_size - 1) / tb_size; + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ double smin; __shared__ int64_t sidx; From 301eea16709e7950d4a6a44dbda2ed33de6fefe3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 6 Jan 2023 09:32:23 -0800 Subject: [PATCH 030/121] style --- python/cudf/CMakeLists.txt | 2 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/udf/__init__.py | 2 +- python/cudf/cudf/core/udf/groupby_function.py | 2 +- python/cudf/cudf/core/udf/groupby_lowering.py | 2 +- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- python/cudf/cudf/core/udf/templates.py | 2 +- python/cudf/cudf/core/udf/utils.py | 2 +- python/cudf/udf_cpp/groupby/CMakeLists.txt | 2 +- python/cudf/udf_cpp/groupby/function.cu | 7 +------ python/strings_udf/cpp/CMakeLists.txt | 2 +- python/strings_udf/strings_udf/__init__.py | 2 +- 12 files changed, 12 insertions(+), 17 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 4e04e3efddb..50a60014cc8 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 37a4c8df7b2..a56c969db00 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import itertools import pickle diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index a6c9fbe2b2a..f8d6c6d69f3 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from functools import lru_cache diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_function.py index ede18f1a4a5..77a286317b6 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_function.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from typing import Any, Dict import numba diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 5fe92b464cc..9e9a49ceb17 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. from numba import types from numba.core import cgutils diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index a19f85e5c46..d99d62c43c8 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. import math import os diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py index a4eca4a7efe..f982d904d7f 100644 --- a/python/cudf/cudf/core/udf/templates.py +++ b/python/cudf/cudf/core/udf/templates.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. unmasked_input_initializer_template = """\ d_{idx} = input_col_{idx} diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 187cc0db571..74c7494d8ef 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import glob import os diff --git a/python/cudf/udf_cpp/groupby/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt index d3116b2d88e..2fd45e59885 100644 --- a/python/cudf/udf_cpp/groupby/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 16899f749f2..e03ff5dec5e 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
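// [Editorial note, not part of the patch] Each device_* helper in this file splits a group
// of `size` elements across one thread block with a block-strided loop. A short worked
// example of the indexing, assuming blockDim.x == 256 and size == 1000:
//
//   items_per_thread = (1000 + 256 - 1) / 256 = 4
//   thread 0 reads data[0], data[256], data[512], data[768]
//   thread 7 reads data[7], data[263], data[519], data[775]
//   thread 250 reads data[250], data[506], data[762]; its fourth candidate index
//   (250 + 3 * 256 = 1018) fails the `threadIdx.x + item * blockDim.x < size` bounds
//   check and is skipped.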
@@ -19,7 +19,6 @@ using size_type = int; - // double atomicAdd __device__ __forceinline__ double atomicAdds(double* address, double val) { @@ -36,7 +35,6 @@ __device__ __forceinline__ double atomicAdds(double* address, double val) return __longlong_as_double(old); } - // int64_t atomicAdd __device__ __forceinline__ int64_t atomicAdds(int64_t* address, int64_t val) { @@ -166,7 +164,6 @@ template __device__ void device_min( T const* data, int const items_per_thread, size_type size, T init_val, T* smin) { - T local_min = init_val; // Calculate local min for each thread @@ -196,7 +193,6 @@ __device__ void device_idxmax(T const* data, T* smax, int64_t* sidx) { - // Calculate how many elements each thread is working on T local_max = init_val; int64_t local_idx = -1; @@ -235,7 +231,6 @@ __device__ void device_idxmin(T const* data, T* smin, int64_t* sidx) { - T local_min = init_val; int64_t local_idx = -1; diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index 4bcb65e3aae..ae87b72c981 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index 0b4dc11779a..b624eab3dc4 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
import glob import os From e50f4a64ba4dce75d2f4d6ef5a4fd65d5881c0c6 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 8 Jan 2023 08:19:21 -0800 Subject: [PATCH 031/121] found the bug --- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index d99d62c43c8..c0d04a54f09 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -173,7 +173,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): func=kern_def._codelibrary.get_cufunc(), b2d_func=0, memsize=0, - blocksizelimit=blocklim, + blocksizelimit=int(blocklim), ) stream = cuda.default_stream() From df1485d141becac34ded0fe39aaa6daf4df30563 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 8 Jan 2023 09:52:51 -0800 Subject: [PATCH 032/121] minor refactoring --- python/cudf/cudf/core/groupby/groupby.py | 3 +-- python/cudf/cudf/core/udf/__init__.py | 7 +------ python/cudf/cudf/core/udf/groupby_lowering.py | 2 +- .../core/udf/{groupby_function.py => groupby_typing.py} | 4 ++-- python/cudf/cudf/core/udf/groupby_utils.py | 8 ++------ python/cudf/cudf/core/udf/utils.py | 8 +------- python/strings_udf/cpp/CMakeLists.txt | 4 ---- 7 files changed, 8 insertions(+), 28 deletions(-) rename python/cudf/cudf/core/udf/{groupby_function.py => groupby_typing.py} (98%) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a56c969db00..85e66a5f8e7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -216,7 +216,6 @@ def __init__( sort=False, as_index=True, dropna=True, - cache=True, group_keys=True, ): """ @@ -883,7 +882,7 @@ def mult(df): index_data[None] = grouped_values.index._column result.index = cudf.MultiIndex._from_data(index_data) else: - raise ValueError("Unsupported engine!.") + raise ValueError(f"Unsupported engine '{engine}'") if self._sort: result = result.sort_index() diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index f8d6c6d69f3..06ceecf0a35 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -9,12 +9,7 @@ from cudf.core.udf import api, row_function, utils from cudf.utils.dtypes import STRING_TYPES -from . import ( - groupby_function, - groupby_lowering, - masked_lowering, - masked_typing, -) +from . 
import groupby_lowering, groupby_typing, masked_lowering, masked_typing _units = ["ns", "ms", "us", "s"] _datetime_cases = {types.NPDatetime(u) for u in _units} diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 9e9a49ceb17..34b2aa9737b 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -6,7 +6,7 @@ from numba.core.typing import signature as nb_signature from numba.cuda.cudaimpl import lower as cuda_lower -from cudf.core.udf.groupby_function import ( +from cudf.core.udf.groupby_typing import ( SUPPORTED_GROUPBY_NUMBA_TYPES, Group, GroupType, diff --git a/python/cudf/cudf/core/udf/groupby_function.py b/python/cudf/cudf/core/udf/groupby_typing.py similarity index 98% rename from python/cudf/cudf/core/udf/groupby_function.py rename to python/cudf/cudf/core/udf/groupby_typing.py index 77a286317b6..94e7a273c1e 100644 --- a/python/cudf/cudf/core/udf/groupby_function.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -112,8 +112,8 @@ def _register_cuda_idxreduction_caller(func, inputty): def caller(data, index, size): return cuda_func(data, index, size) - # idxmax and idxmin always return int64 - type_key = (types.int64, inputty) + # only support default index type right now + type_key = (index_default_type, inputty) if call_cuda_functions.get(func.lower()) is None: call_cuda_functions[func.lower()] = {} call_cuda_functions[func.lower()][type_key] = caller diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index c0d04a54f09..8fa0b2eb320 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -11,7 +11,7 @@ from numba.types import Record import cudf.core.udf.utils -from cudf.core.udf.groupby_function import ( +from cudf.core.udf.groupby_typing import ( SUPPORTED_GROUPBY_NUMPY_TYPES, Group, GroupType, @@ -74,12 +74,8 @@ def _get_frame_groupby_type(dtype, index_dtype): def _groupby_apply_kernel_string_from_template(frame, args): """ - Function to write numba kernels for `DataFrame.apply` as a string. + Function to write numba kernels for `Groupby.apply` as a string. Workaround until numba supports functions that use `*args` - - Both the number of input columns as well as their nullability and any - scalar arguments may vary, so the kernels vary significantly. See - templates.py for the full row kernel template and more details. """ # Create argument list for kernel frame = _supported_cols_from_frame( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 74c7494d8ef..f28a049bfea 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -296,13 +296,7 @@ def _get_ptx_file(path, prefix): cc = int("".join(str(x) for x in dev.compute_capability)) files = glob.glob(os.path.join(path, f"{prefix}*.ptx")) if len(files) == 0: - raise RuntimeError( - "This strings_udf installation is missing the necessary PTX " - f"files for compute capability {cc}. " - "Please file an issue reporting this error and how you " - "installed cudf and strings_udf." 
- "https://github.com/rapidsai/cudf/issues" - ) + raise RuntimeError(f"Missing PTX files for cc={cc}") regular_sms = [] for f in files: diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index ae87b72c981..2cab9871f74 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -101,10 +101,6 @@ list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "") list(SORT CMAKE_CUDA_ARCHITECTURES) list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES) -message("\n\n\n") -message("${CMAKE_CUDA_ARCHITECTURES}") -message("\n\n\n") - foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) set(tgt shim_${arch}) From 14fe3cb4e6bab2e2f194dfe478b6803b1cba5cd5 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 9 Jan 2023 11:24:58 -0800 Subject: [PATCH 033/121] update/add tests --- python/cudf/cudf/core/groupby/groupby.py | 3 ++ python/cudf/cudf/tests/test_groupby.py | 63 ++++++++++++++++++------ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 85e66a5f8e7..6862b9f00ae 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -852,6 +852,9 @@ def mult(df): ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names + if len(result.index.names) == 1: + result = result.reset_index() + result[None] = result.pop(0) elif engine == "cudf": ngroups = len(offsets) - 1 if ngroups > self._MAX_GROUPS_BEFORE_WARN: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 3eba1538ff4..0e92531e127 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -20,6 +20,7 @@ PANDAS_GE_150, PANDAS_LT_140, ) +from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.testing._utils import ( DATETIME_TYPES, SIGNED_TYPES, @@ -414,6 +415,49 @@ def test_groupby_apply_jit(func): assert_groupby_results_equal(expect, got_jit) +@pytest.fixture(scope="module") +def groupby_jit_data(): + np.random.seed(0) + df = DataFrame() + nelem = 20 + df["key1"] = np.random.randint(0, 3, nelem) + df["key2"] = np.random.randint(0, 2, nelem) + df["val1"] = np.random.random(nelem) + df["val2"] = np.random.random(nelem) + return df + + +def run_groupby_apply_jit_test(data, func, keys, *args): + expect_groupby_obj = data.to_pandas().groupby(keys, as_index=False) + got_groupby_obj = data.groupby(keys) + + cudf_jit_result = got_groupby_obj.apply(func, engine="jit") + pandas_result = expect_groupby_obj.apply(func) + # compare cuDF jit to pandas + assert_groupby_results_equal(cudf_jit_result, pandas_result) + + +@pytest.mark.parametrize("dtype", SUPPORTED_GROUPBY_NUMPY_TYPES) +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] +) +def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): + # dynamically generate to avoid pickling error + + funcstr = f""" +def func(df): + return df['val1'].{func}() + """ + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + groupby_jit_data["val2"] = groupby_jit_data["val2"].astype(dtype) + + run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + + def create_test_groupby_apply_jit_args_params(): def f1(df, k): return df["val1"].max() + df["val2"].min() + k @@ -430,23 +474,14 @@ def f3(df, k, L, m): 
@pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) -def test_groupby_apply_jit_args(func, args): - np.random.seed(0) - df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) +def test_groupby_apply_jit_args(func, args, groupby_jit_data): - expect_grpby = df.to_pandas().groupby(["key1", "key2"]) - got_grpby = df.groupby(["key1", "key2"]) + expect_grpby = groupby_jit_data.to_pandas().groupby(["key1", "key2"]) + got_grpby = groupby_jit_data.groupby(["key1", "key2"]) expect = expect_grpby.apply(func, *args) - got_nonjit = got_grpby.apply(func, *args) - got_jit = got_grpby.apply(func, *args, engine="jit") - assert_groupby_results_equal(expect, got_nonjit) - assert_groupby_results_equal(expect, got_jit) + got = got_grpby.apply(func, *args, engine="jit") + assert_groupby_results_equal(expect, got) @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) From f7791b48fc005ea991174668c024c57ee4b9ebb8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 9 Jan 2023 11:42:18 -0800 Subject: [PATCH 034/121] continue refactoring tests --- python/cudf/cudf/core/groupby/groupby.py | 6 +-- python/cudf/cudf/tests/test_groupby.py | 62 ++++++------------------ 2 files changed, 18 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6862b9f00ae..35f3dcaf19e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -852,9 +852,9 @@ def mult(df): ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names - if len(result.index.names) == 1: - result = result.reset_index() - result[None] = result.pop(0) + # if len(result.index.names) == 1: + result = result.reset_index() + result[None] = result.pop(0) elif engine == "cudf": ngroups = len(offsets) - 1 if ngroups > self._MAX_GROUPS_BEFORE_WARN: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0e92531e127..c1798d9dadf 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -377,44 +377,6 @@ def emulate(df): assert_groupby_results_equal(expect, got) -@pytest.mark.parametrize( - "func", - [ - lambda df: df["val1"].max() + df["val2"].min(), - lambda df: df["val1"].idxmax() + df["val2"].idxmin(), - ], -) -def test_groupby_apply_jit(func): - np.random.seed(0) - df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) - - expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) - got_grpby = df.groupby(["key1", "key2"]) - - expect = expect_grpby.apply(func) - # TODO: Due to some inconsistencies between how pandas and cudf handle the - # created index we get different columns in the index vs the data and a - # different name. For now I'm hacking around this to test the core - # functionality, but we'll need to update that eventually. 
- names = list(expect.columns) - names[2] = 0 - expect.columns = names - - got_jit = got_grpby.apply(func, engine="jit").reset_index() - # TODO: Shouldn't have to reset_index below - try: - got_nonjit = got_grpby.apply(func).reset_index() - assert_groupby_results_equal(expect, got_nonjit) - assert_groupby_results_equal(expect, got_jit) - except AttributeError: - assert_groupby_results_equal(expect, got_jit) - - @pytest.fixture(scope="module") def groupby_jit_data(): np.random.seed(0) @@ -431,9 +393,10 @@ def run_groupby_apply_jit_test(data, func, keys, *args): expect_groupby_obj = data.to_pandas().groupby(keys, as_index=False) got_groupby_obj = data.groupby(keys) - cudf_jit_result = got_groupby_obj.apply(func, engine="jit") - pandas_result = expect_groupby_obj.apply(func) + cudf_jit_result = got_groupby_obj.apply(func, *args, engine="jit") + pandas_result = expect_groupby_obj.apply(func, *args) # compare cuDF jit to pandas + assert_groupby_results_equal(cudf_jit_result, pandas_result) @@ -458,6 +421,17 @@ def func(df): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) +@pytest.mark.parametrize( + "func", + [ + lambda df: df["val1"].max() + df["val2"].min(), + lambda df: df["val1"].idxmax() + df["val2"].idxmin(), + ], +) +def test_groupby_apply_jit(func, groupby_jit_data): + run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"]) + + def create_test_groupby_apply_jit_args_params(): def f1(df, k): return df["val1"].max() + df["val2"].min() + k @@ -475,13 +449,7 @@ def f3(df, k, L, m): "func,args", create_test_groupby_apply_jit_args_params() ) def test_groupby_apply_jit_args(func, args, groupby_jit_data): - - expect_grpby = groupby_jit_data.to_pandas().groupby(["key1", "key2"]) - got_grpby = groupby_jit_data.groupby(["key1", "key2"]) - - expect = expect_grpby.apply(func, *args) - got = got_grpby.apply(func, *args, engine="jit") - assert_groupby_results_equal(expect, got) + run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) From 7f63c9062c60486b7f977c940c765b33d09d783a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 12 Jan 2023 08:22:27 -0800 Subject: [PATCH 035/121] add docs, switch to partials --- python/cudf/cudf/core/groupby/groupby.py | 6 +- python/cudf/cudf/core/udf/groupby_lowering.py | 77 +++++++++++-------- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 35f3dcaf19e..3082ca8520c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -785,6 +785,11 @@ def apply(self, function, *args, engine="cudf"): func : function The python transformation function that will be applied on the grouped chunk. + engine: {'cudf', 'jit'}, default 'cudf' + Selects the GroupBy.apply implementation. Use `jit` to + select the numba JIT pipeline. + For more information, see the `cuDF guide to user defined functions + `__. 
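# [Editorial note, not part of the patch] A minimal usage sketch of the keyword documented
# above, mirroring the patterns used in the test suite; the DataFrame contents and key/column
# names are illustrative, and in this patch series the JIT engine expects the UDF to reduce
# each group to a scalar via the supported reductions (max, min, sum, mean, var, std,
# idxmax, idxmin, size, count):
#
#   import cudf
#   gdf = cudf.DataFrame({"key": [0, 0, 1, 1], "val": [1.0, 4.0, 2.0, 8.0]})
#   out = gdf.groupby("key").apply(
#       lambda df: df["val"].max() - df["val"].min(), engine="jit"
#   )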
Examples -------- @@ -852,7 +857,6 @@ def mult(df): ) result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names - # if len(result.index.names) == 1: result = result.reset_index() result[None] = result.pop(0) elif engine == "cudf": diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 34b2aa9737b..06b52b4ed18 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -1,5 +1,7 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. +from functools import partial + from numba import types from numba.core import cgutils from numba.core.extending import lower_builtin @@ -15,20 +17,34 @@ def lowering_function(context, builder, sig, args, function): + """ + Instruction boilerplate used for calling a groupby reduction + __device__ function. Centers around a forward declaration of + this function and adds the pre/post processing instructions + necessary for calling it. + """ + # return type retty = sig.return_type + # a variable logically corresponding to the calling `Group` grp = cgutils.create_struct_proxy(sig.args[0])( context, builder, value=args[0] ) - grp_type = sig.args[0] + # what specific (numba) GroupType + grp_type = sig.args[0] group_dataty = grp_type.group_data_type + + # logically take the address of the group's data pointer group_data_ptr = builder.alloca(grp.group_data.type) builder.store(grp.group_data, group_data_ptr) + # obtain the correct forward declaration from registry type_key = (sig.return_type, grp_type.group_scalar_type) func = call_cuda_functions[function][type_key] + # insert the forward declaration and return its result + # pass it the data pointer and the group's size return context.compile_internal( builder, func, @@ -39,28 +55,46 @@ def lowering_function(context, builder, sig, args, function): @lower_builtin(Group, types.Array, types.int64, types.Array) def group_constructor(context, builder, sig, args): + """ + Instruction boilerplate used for instantiating a Group + struct from a data pointer, an index pointer, and a size + """ + group_data, size, index = args + # a variable logically corresponding to the calling `Group` grp = cgutils.create_struct_proxy(sig.return_type)(context, builder) + # the group data array and its pointer arr_group_data = cgutils.create_struct_proxy(sig.args[0])( context, builder, value=group_data ) group_data_ptr = arr_group_data.data + # the group index array and its pointer arr_index = cgutils.create_struct_proxy(sig.args[2])( context, builder, value=index ) index_ptr = arr_index.data + # fill the struct explicitly grp.group_data = group_data_ptr grp.index = index_ptr grp.size = size + # return the struct by value return grp._getvalue() -def cuda_Group_idx_max_or_min(context, builder, sig, args, fname): +def cuda_Group_idx_max_or_min(context, builder, sig, args, function): + """ + Instruction boilerplate used for calling a groupby reduction + __device__ function in the case where the function is either + `idxmax` or `idxmin`. See `lowering_function` for details. This + lowering differs from other reductions due to the presence of + the index. This results in the forward declaration expecting + an extra arg. 
+ """ retty = sig.return_type grp = cgutils.create_struct_proxy(sig.args[0])( @@ -76,7 +110,7 @@ def cuda_Group_idx_max_or_min(context, builder, sig, args, fname): index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) type_key = (types.int64, grp_type.group_scalar_type) - func = call_cuda_functions[fname][type_key] + func = call_cuda_functions[function][type_key] return context.compile_internal( builder, @@ -86,12 +120,15 @@ def cuda_Group_idx_max_or_min(context, builder, sig, args, fname): ) -def cuda_Group_max(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "max") - +cuda_Group_max = partial(lowering_function, function="max") +cuda_Group_min = partial(lowering_function, function="min") +cuda_Group_sum = partial(lowering_function, function="sum") +cuda_Group_mean = partial(lowering_function, function="mean") +cuda_Group_std = partial(lowering_function, function="std") +cuda_Group_var = partial(lowering_function, function="var") -def cuda_Group_min(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "min") +cuda_Group_idxmax = partial(cuda_Group_idx_max_or_min, function="idxmax") +cuda_Group_idxmin = partial(cuda_Group_idx_max_or_min, function="idxmin") def cuda_Group_size(context, builder, sig, args): @@ -108,30 +145,6 @@ def cuda_Group_count(context, builder, sig, args): return grp.size -def cuda_Group_sum(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "sum") - - -def cuda_Group_mean(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "mean") - - -def cuda_Group_std(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "std") - - -def cuda_Group_var(context, builder, sig, args): - return lowering_function(context, builder, sig, args, "var") - - -def cuda_Group_idxmax(context, builder, sig, args): - return cuda_Group_idx_max_or_min(context, builder, sig, args, "idxmax") - - -def cuda_Group_idxmin(context, builder, sig, args): - return cuda_Group_idx_max_or_min(context, builder, sig, args, "idxmin") - - for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: cuda_lower("GroupType.max", GroupType(ty))(cuda_Group_max) cuda_lower("GroupType.min", GroupType(ty))(cuda_Group_min) From 902223a54b9c92dd1346104cc69046f337d5c3c4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 17 Jan 2023 07:28:23 -0800 Subject: [PATCH 036/121] continue addressing reviews --- python/cudf/cudf/core/indexed_frame.py | 5 ++++- python/cudf/cudf/core/udf/groupby_typing.py | 3 --- python/cudf/cudf/core/udf/groupby_utils.py | 9 ++++++++- python/cudf/cudf/core/udf/utils.py | 8 ++++++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6526ba1e7c3..1af73044f67 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,6 +24,8 @@ ) from uuid import uuid4 +from cudf.core.udf.utils import NoNumbaOccWarnings + import cupy as cp import numpy as np import pandas as pd @@ -2128,7 +2130,8 @@ def _apply(self, func, kernel_getter, *args, **kwargs): launch_args = output_args + input_args + list(args) try: - kernel.forall(len(self))(*launch_args) + with NoNumbaOccWarnings(): + kernel.forall(len(self))(*launch_args) except Exception as e: raise RuntimeError("UDF kernel execution failed.") from e diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 
94e7a273c1e..30e11376885 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -15,9 +15,6 @@ from numba.cuda.cudadecl import registry as cuda_registry from numba.np import numpy_support -# Disable occupancy warnings to avoid polluting output when there are few -# groups. -numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 index_default_type = types.int64 SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 8fa0b2eb320..d5b6bb8bdfd 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -5,12 +5,14 @@ import cupy as cp import numpy as np +import numba from numba import cuda, types from numba.cuda.cudadrv.devices import get_context from numba.np import numpy_support from numba.types import Record import cudf.core.udf.utils +from cudf.core.udf.utils import NoNumbaOccWarnings from cudf.core.udf.groupby_typing import ( SUPPORTED_GROUPBY_NUMPY_TYPES, Group, @@ -29,6 +31,8 @@ ) from cudf.utils.utils import _cudf_nvtx_annotate +from contextlib import contextmanager + dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") cudf.core.udf.utils.ptx_files.append(dev_func_ptx) @@ -174,7 +178,10 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): stream = cuda.default_stream() - specialized[ngroups, tpb, stream](*launch_args) + # Disable occupancy warnings to avoid polluting output when there are few + # groups. + with NoNumbaOccWarnings(): + specialized[ngroups, tpb, stream](*launch_args) stream.synchronize() diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index f28a049bfea..b22f9a8c08d 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -4,6 +4,8 @@ import os from typing import Any, Callable, Dict, List +import numba + import cachetools import cupy as cp import numpy as np @@ -321,3 +323,9 @@ def _get_ptx_file(path, prefix): ) else: return regular_result[1] + +class NoNumbaOccWarnings(object): + def __enter__(self): + numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + def __exit__(self, exc_type, exc_val, exc_tb): + numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 1 From 78f8b6f212bfcb087e96cba6ce388cc283601616 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 17 Jan 2023 09:48:30 -0600 Subject: [PATCH 037/121] Update python/cudf/cudf/core/udf/groupby_typing.py Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/core/udf/groupby_typing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 94e7a273c1e..d46ddd28f44 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -20,9 +20,8 @@ numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 index_default_type = types.int64 -SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] SUPPORTED_GROUPBY_NUMPY_TYPES = [ - numpy_support.as_dtype(dt) for dt in SUPPORTED_GROUPBY_NUMBA_TYPES + numpy_support.as_dtype(dt) for dt in [types.int64, types.float64] ] From 2849680ffef491b66e2b8bea87cb05cbf69c3b54 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 17 Jan 2023 07:58:46 -0800 Subject: [PATCH 038/121] merge remote --- python/cudf/cudf/core/udf/groupby_typing.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 
deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 4c1f21ba6c3..e2b91da9908 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -14,15 +14,23 @@ from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry from numba.np import numpy_support +import pandas as pd -index_default_type = types.int64 +index_default_type = pd.RangeIndex(0,0).dtype # int64 +SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] SUPPORTED_GROUPBY_NUMPY_TYPES = [ numpy_support.as_dtype(dt) for dt in [types.int64, types.float64] ] class Group(object): + """ + A piece of python code whose purpose is to be replaced + during compilation. After being registered to GroupType, + serves as a handle for instantiating GroupType objects + in python code and accessing their attributes + """ def __init__(self, group_data, size, index, dtype, index_dtype): self.group_data = group_data self.size = size @@ -32,6 +40,11 @@ def __init__(self, group_data, size, index, dtype, index_dtype): class GroupType(numba.types.Type): + """ + Numba extension type carrying metadata associated with a single + GroupBy group. This metadata ultimately is passed to the CUDA + __device__ function which actually performs the work. + """ def __init__(self, group_scalar_type, index_type=index_default_type): self.group_scalar_type = group_scalar_type self.index_type = index_type From 33109f5a886b10b9164c831f3f0e55578cafed1c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 17 Jan 2023 09:54:24 -0800 Subject: [PATCH 039/121] address more reviews --- python/cudf/cudf/core/udf/groupby_lowering.py | 7 +++ python/cudf/cudf/core/udf/groupby_typing.py | 51 +++++++------------ python/cudf/cudf/core/udf/groupby_utils.py | 43 +++++++++++----- python/cudf/cudf/core/udf/utils.py | 17 +++++++ 4 files changed, 72 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 06b52b4ed18..606f8c6ebc8 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -13,6 +13,7 @@ Group, GroupType, call_cuda_functions, + index_default_type ) @@ -102,6 +103,12 @@ def cuda_Group_idx_max_or_min(context, builder, sig, args, function): ) grp_type = sig.args[0] + if grp_type.index_type != index_default_type: + raise TypeError( + f"Only inputs with default index dtype {index_default_type} " + "are supported." + ) + group_dataty = grp_type.group_data_type group_data_ptr = builder.alloca(grp.group_data.type) builder.store(grp.group_data, group_data_ptr) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index e2b91da9908..890627a8c30 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -17,7 +17,7 @@ import pandas as pd -index_default_type = pd.RangeIndex(0,0).dtype # int64 +index_default_type = numpy_support.from_dtype(pd.RangeIndex(0,0).dtype) # int64 SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] SUPPORTED_GROUPBY_NUMPY_TYPES = [ numpy_support.as_dtype(dt) for dt in [types.int64, types.float64] @@ -43,7 +43,7 @@ class GroupType(numba.types.Type): """ Numba extension type carrying metadata associated with a single GroupBy group. 
This metadata ultimately is passed to the CUDA - __device__ function which actually performs the work. + __device__ function which actually performs the work. """ def __init__(self, group_scalar_type, index_type=index_default_type): self.group_scalar_type = group_scalar_type @@ -58,10 +58,15 @@ def __init__(self, group_scalar_type, index_type=index_default_type): @typeof_impl.register(Group) def typeof_group(val, c): + """ + Tie Group and GroupType together such that when Numba + sees usage of Group in raw python code, it knows to + treat those usages as uses of GroupType + """ return GroupType( numba.np.numpy_support.from_dtype(val.dtype), numba.np.numpy_support.from_dtype(val.index_dtype), - ) # Identifies instances of the Group class as GroupType + ) # The typing of the python "function" Group.__init__ @@ -170,6 +175,16 @@ def generic(self, args, kws): class GroupAttr(AttributeTemplate): key = GroupType + resolve_max = _create_reduction_attr("GroupType.max") + resolve_min = _create_reduction_attr("GroupType.min") + resolve_sum = _create_reduction_attr("GroupType.sum") + + resolve_size = _create_reduction_attr("GroupType.size", retty=types.int64) + resolve_count = _create_reduction_attr("GroupType.count", retty=types.int64) + resolve_mean = _create_reduction_attr("GroupType.mean", retty=types.float64) + resolve_var = _create_reduction_attr("GroupType.var", retty=types.float64) + resolve_std = _create_reduction_attr("GroupType.std", retty=types.float64) + def resolve_idxmax(self, mod): return types.BoundFunction( GroupIdxMax, GroupType(mod.group_scalar_type, mod.index_type) @@ -181,36 +196,6 @@ def resolve_idxmin(self, mod): ) -setattr(GroupAttr, "resolve_max", _create_reduction_attr("GroupType.max")) -setattr(GroupAttr, "resolve_min", _create_reduction_attr("GroupType.min")) -setattr(GroupAttr, "resolve_sum", _create_reduction_attr("GroupType.sum")) -setattr( - GroupAttr, - "resolve_size", - _create_reduction_attr("GroupType.size", retty=types.int64), -) -setattr( - GroupAttr, - "resolve_count", - _create_reduction_attr("GroupType.count", retty=types.int64), -) -setattr( - GroupAttr, - "resolve_mean", - _create_reduction_attr("GroupType.mean", retty=types.float64), -) -setattr( - GroupAttr, - "resolve_var", - _create_reduction_attr("GroupType.var", retty=types.float64), -) -setattr( - GroupAttr, - "resolve_std", - _create_reduction_attr("GroupType.std", retty=types.float64), -) - - _register_cuda_reduction_caller("Max", types.float64, types.float64) _register_cuda_reduction_caller("Max", types.int64, types.int64) _register_cuda_reduction_caller("Min", types.float64, types.float64) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index d5b6bb8bdfd..8c85127c36b 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -28,10 +28,10 @@ _get_udf_return_type, _supported_cols_from_frame, _supported_dtypes_from_frame, + _get_extensionty_size ) from cudf.utils.utils import _cudf_nvtx_annotate -from contextlib import contextmanager dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") cudf.core.udf.utils.ptx_files.append(dev_func_ptx) @@ -43,8 +43,16 @@ def _get_frame_groupby_type(dtype, index_dtype): Models the column as a dictionary like data structure containing GroupTypes. See numba.np.numpy_support.from_struct_dtype for details. 
- """ + Parameters + ---------- + level : np.dtype + A numpy structured array dtype associating field names + to scalar dtypes + index_dtype : np.dtype + A numpy scalar dtype associated with the index of the + incoming grouped data + """ # Create the numpy structured type corresponding to the numpy dtype. fields = [] offset = 0 @@ -55,21 +63,19 @@ def _get_frame_groupby_type(dtype, index_dtype): title = info[2] if len(info) == 3 else None ty = numpy_support.from_dtype(elemdtype) indexty = numpy_support.from_dtype(index_dtype) + groupty = GroupType(ty, indexty) infos = { - "type": GroupType(ty, indexty), + "type": groupty, "offset": offset, "title": title, } fields.append((name, infos)) - - offset += 8 + 8 + 8 # group struct size (2 pointers and 1 integer) + offset += _get_extensionty_size(groupty) # Align the next member of the struct to be a multiple of the # memory access size, per PTX ISA 7.4/5.4.5 if i < len(sizes) - 1: - # next_itemsize = sizes[i + 1] - next_itemsize = 8 - offset = int(math.ceil(offset / next_itemsize) * next_itemsize) + offset = int(math.ceil(offset / 8) * 8) # Numba requires that structures are aligned for the CUDA target _is_aligned_struct = True @@ -131,6 +137,20 @@ def _get_groupby_apply_kernel(frame, func, args): @_cudf_nvtx_annotate def jit_groupby_apply(offsets, grouped_values, function, *args): + """ + Main entrypoint for JIT Groupby.apply via Numba. + + Parameters + ---------- + offsets : list + A list of intergers denoting the indices of the group + boundries in grouped_values + grouped_values : DataFrame + A DataFrame representing the source data + sorted by group keys + function: callable + The user UDF defined on a DataFrame + """ offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 @@ -157,7 +177,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): if max_group_size >= 1000: blocklim = 256 else: - blocklim = ((max_group_size + 32 - 1) / 32) * 32 + blocklim = ((max_group_size + 32 - 1) // 32) * 32 if kernel.specialized: specialized = kernel @@ -168,7 +188,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): ctx = get_context() # Dispatcher is specialized, so there's only one definition - get # it so we can get the cufunc from the code library - kern_def = next(iter(specialized.overloads.values())) + kern_def, = specialized.overloads.values() grid, tpb = ctx.get_max_potential_block_size( func=kern_def._codelibrary.get_cufunc(), b2d_func=0, @@ -176,13 +196,10 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): blocksizelimit=int(blocklim), ) - stream = cuda.default_stream() - # Disable occupancy warnings to avoid polluting output when there are few # groups. 
with NoNumbaOccWarnings(): specialized[ngroups, tpb, stream](*launch_args) - stream.synchronize() return output diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index b22f9a8c08d..519607cc3a1 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -6,6 +6,11 @@ import numba +from numba.core.datamodel import default_manager +from numba.cuda.cudadrv import nvvm + +import llvmlite.binding as ll + import cachetools import cupy as cp import numpy as np @@ -329,3 +334,15 @@ def __enter__(self): numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 def __exit__(self, exc_type, exc_val, exc_tb): numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 1 + +def _get_extensionty_size(ty): + """ + Return the size of an extension type in bytes + """ + data_layout = nvvm.data_layout + if isinstance(data_layout, dict): + data_layout = data_layout[64] + target_data = ll.create_target_data(data_layout) + llty = default_manager[ty].get_value_type() + return llty.get_abi_size(target_data) + From 07444eb2ffc2cdda6b266b78700afd2590e2564e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 17 Jan 2023 11:24:36 -0800 Subject: [PATCH 040/121] move utilities around --- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- python/strings_udf/strings_udf/_typing.py | 18 +++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 8c85127c36b..3f8f787c01b 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -199,7 +199,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): # Disable occupancy warnings to avoid polluting output when there are few # groups. 
with NoNumbaOccWarnings(): - specialized[ngroups, tpb, stream](*launch_args) + specialized[ngroups, tpb](*launch_args) return output diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 99e4046b0b3..7749dd242d9 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -5,23 +5,17 @@ import llvmlite.binding as ll import numpy as np from numba import types -from numba.core.datamodel import default_manager from numba.core.extending import models, register_model from numba.core.typing import signature as nb_signature from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudadrv import nvvm -data_layout = nvvm.data_layout +from cudf.core.udf.utils import _get_extensionty_size -# libcudf size_type -size_type = types.int32 -# workaround for numba < 0.56 -if isinstance(data_layout, dict): - data_layout = data_layout[64] -target_data = ll.create_target_data(data_layout) +# libcudf size_type +size_type = types.int32 # String object definitions class UDFString(types.Type): @@ -30,8 +24,7 @@ class UDFString(types.Type): def __init__(self): super().__init__(name="udf_string") - llty = default_manager[self].get_value_type() - self.size_bytes = llty.get_abi_size(target_data) + self.size_bytes = _get_extensionty_size(self) @property def return_type(self): @@ -44,8 +37,7 @@ class StringView(types.Type): def __init__(self): super().__init__(name="string_view") - llty = default_manager[self].get_value_type() - self.size_bytes = llty.get_abi_size(target_data) + self.size_bytes = _get_extensionty_size(self) @property def return_type(self): From 39eb8f93e59db4feaca55010af67a8f12416b484 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 18 Jan 2023 13:27:33 -0800 Subject: [PATCH 041/121] template throughout c++ --- python/cudf/udf_cpp/groupby/function.cu | 369 ++++++++---------------- 1 file changed, 125 insertions(+), 244 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index e03ff5dec5e..f8d8594a4a2 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -258,366 +258,247 @@ __device__ void device_idxmin(T const* data, __syncthreads(); } -extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t size) -{ - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ int64_t sum; - if (threadIdx.x == 0) { sum = 0; } - - __syncthreads(); - - device_sum(data, items_per_thread, size, &sum); - - *numba_return_value = sum; - - return 0; -} - -extern "C" __device__ int BlockSum_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - // Calculate how many elements each thread is working on +template +__device__ T BlockSum(T const* data, int64_t size) { auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; + __shared__ T sum; - __shared__ double sum; if (threadIdx.x == 0) { sum = 0; } - __syncthreads(); + device_sum(data, items_per_thread, size, &sum); + return sum; - device_sum(data, items_per_thread, size, &sum); - - *numba_return_value = sum; - - return 0; } -extern "C" __device__ int BlockMean_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ +template +__device__ T BlockMean(T const* data, int64_t size) { // Calculate 
how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ int64_t sum; + __shared__ T sum; if (threadIdx.x == 0) { sum = 0; } __syncthreads(); - - device_sum(data, items_per_thread, size, &sum); - + device_sum(data, items_per_thread, size, &sum); double mean = sum / static_cast(size); - - *numba_return_value = mean; - - return 0; + return mean; } -extern "C" __device__ int BlockMean_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double sum; - if (threadIdx.x == 0) { sum = 0; } - - __syncthreads(); - - device_sum(data, items_per_thread, size, &sum); - - double mean = sum / static_cast(size); - - *numba_return_value = mean; - return 0; -} - -extern "C" __device__ int BlockStd_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ +template +__device__ T BlockStd(T const* data, int64_t size) { // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ int64_t sum; + __shared__ T sum; __shared__ double var; - if (threadIdx.x == 0) { sum = 0; var = 0; } - __syncthreads(); - - device_var(data, items_per_thread, size, &sum, &var); - - *numba_return_value = sqrt(var); - - return 0; + device_var(data, items_per_thread, size, &sum, &var); + return sqrt(var); } -extern "C" __device__ int BlockStd_float64(double* numba_return_value, - double const* data, - int64_t size) -{ +template +__device__ T BlockVar(T const* data, int64_t size) { // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double sum; + __shared__ T sum; __shared__ double var; - if (threadIdx.x == 0) { sum = 0; var = 0; } - __syncthreads(); + device_var(data, items_per_thread, size, &sum, &var); + return var; +} - device_var(data, items_per_thread, size, &sum, &var); - - *numba_return_value = sqrt(var); - return 0; +template +__device__ T BlockMax(T const* data, int64_t size) { + // Calculate how many elements each thread is working on + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; + __shared__ T smax; + if (threadIdx.x == 0) { smax = INT64_MIN; } + __syncthreads(); + device_max(data, items_per_thread, size, INT64_MIN, &smax); + return smax; } -extern "C" __device__ int BlockVar_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ +template +__device__ T BlockMin(T const* data, int64_t size) { // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; + __shared__ T smin; + if (threadIdx.x == 0) { smin = INT64_MAX; } + __syncthreads(); + device_min(data, items_per_thread, size, INT64_MAX, &smin); + return smin; +} - __shared__ int64_t sum; - __shared__ double var; - +template +__device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) { + // Calculate how many elements each thread is working on + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; + __shared__ T smax; + __shared__ int64_t sidx; if (threadIdx.x == 0) { - sum = 0; - var = 0; + smax = INT64_MIN; + sidx = INT64_MAX; } - __syncthreads(); + device_idxmax(data, items_per_thread, index, size, INT64_MIN, &smax, &sidx); + return sidx; +} - device_var(data, items_per_thread, size, &sum, &var); +template +__device__ T 
BlockIdxMin(T const* data, int64_t* index, T min, int64_t size) { + auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; + __shared__ T smin; + __shared__ int64_t sidx; + if (threadIdx.x == 0) { + smin = min; + sidx = INT64_MAX; + } + __syncthreads(); + device_idxmin(data, items_per_thread, index, size, min, &smin, &sidx); + return sidx; +} - *numba_return_value = var; +extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { + *numba_return_value = BlockSum(data, size); return 0; } -extern "C" __device__ int BlockVar_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double sum; - __shared__ double var; - - if (threadIdx.x == 0) { - sum = 0; - var = 0; - } +extern "C" __device__ int BlockSum_float64(double* numba_return_value, double const* data, int64_t size) { + *numba_return_value = BlockSum(data, size); + return 0; +} - __syncthreads(); - device_var(data, items_per_thread, size, &sum, &var); +extern "C" __device__ int BlockMean_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { + *numba_return_value = BlockMean(data, size); + return 0; +} - *numba_return_value = var; +extern "C" __device__ int BlockMean_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + *numba_return_value = BlockMean(data, size); return 0; } -// Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, + +extern "C" __device__ int BlockStd_int64(double* numba_return_value, int64_t const* data, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ int64_t smax; - - if (threadIdx.x == 0) { smax = INT64_MIN; } + *numba_return_value = BlockStd(data, size); + return 0; +} - __syncthreads(); - device_max(data, items_per_thread, size, INT64_MIN, &smax); +extern "C" __device__ int BlockStd_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + *numba_return_value = BlockStd(data, size); + return 0; +} - *numba_return_value = smax; +extern "C" __device__ int BlockVar_int64(double* numba_return_value, + int64_t const* data, + int64_t size) +{ + *numba_return_value = BlockVar(data, size); return 0; } -// Calculate maximum of the group, return the scalar -extern "C" __device__ int BlockMax_float64(double* numba_return_value, + +extern "C" __device__ int BlockVar_float64(double* numba_return_value, double const* data, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double smax; - - if (threadIdx.x == 0) { smax = -DBL_MAX; } - - __syncthreads(); - - device_max(data, items_per_thread, size, -DBL_MAX, &smax); - - *numba_return_value = smax; - - return 0; +*numba_return_value = BlockVar(data, size); +return 0; } -// Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, + +extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ int64_t smin; - - if (threadIdx.x == 0) { smin = INT64_MAX; } - - 
__syncthreads(); - - device_min(data, items_per_thread, size, INT64_MAX, &smin); - - *numba_return_value = smin; - + *numba_return_value = BlockMax(data, size); return 0; } -// Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockMin_float64(double* numba_return_value, +extern "C" __device__ int BlockMax_float64(double* numba_return_value, double const* data, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double smin; + *numba_return_value = BlockMax(data, size); + return 0; +} - if (threadIdx.x == 0) { smin = DBL_MAX; } - __syncthreads(); - device_min(data, items_per_thread, size, DBL_MAX, &smin); +extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t size) +{ + *numba_return_value = BlockMin(data, size); + return 0; +} - *numba_return_value = smin; +extern "C" __device__ int BlockMin_float64(double* numba_return_value, + double const* data, + int64_t size) +{ + *numba_return_value = BlockMin(data, size); return 0; } -// Calculate minimum of the group, return the scalar + extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, int64_t const* data, int64_t* index, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ int64_t smax; - __shared__ int64_t sidx; - - if (threadIdx.x == 0) { - smax = INT64_MIN; - sidx = INT64_MAX; - } - - __syncthreads(); - - device_idxmax(data, items_per_thread, index, size, INT64_MIN, &smax, &sidx); - - *numba_return_value = sidx; - + *numba_return_value = BlockIdxMax(data, index, size); return 0; } -// Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, +extern "C" __device__ int BlockIdxMax_float64(double* numba_return_value, double const* data, int64_t* index, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double smax; - __shared__ int64_t sidx; - - if (threadIdx.x == 0) { - smax = -DBL_MAX; - sidx = INT64_MAX; - } - - __syncthreads(); - - device_idxmax(data, items_per_thread, index, size, -DBL_MAX, &smax, &sidx); - - *numba_return_value = smax; - + *numba_return_value = BlockIdxMax(data, index, size); return 0; } -// Calculate minimum of the group, return the scalar + extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, int64_t const* data, int64_t* index, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ int64_t smin; - __shared__ int64_t sidx; - - if (threadIdx.x == 0) { - smin = INT64_MAX; - sidx = INT64_MAX; - } - - __syncthreads(); - - device_idxmin(data, items_per_thread, index, size, INT64_MAX, &smin, &sidx); - - *numba_return_value = sidx; - + *numba_return_value = BlockIdxMin(data, index, INT64_MAX, size); return 0; } -// Calculate minimum of the group, return the scalar -extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, +extern "C" __device__ int BlockIdxMin_float64(double* numba_return_value, double const* data, int64_t* index, int64_t size) { - // Calculate how many elements each thread is working on - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ double smin; - 
__shared__ int64_t sidx; - - if (threadIdx.x == 0) { - smin = DBL_MAX; - sidx = INT64_MAX; - } - - __syncthreads(); - - device_idxmin(data, items_per_thread, index, size, DBL_MAX, &smin, &sidx); - - *numba_return_value = sidx; - + *numba_return_value = BlockIdxMin(data, index, DBL_MAX, size); return 0; } From 6158cb710c9ce0bf88c8af33b7dd17b673c4546a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Jan 2023 17:42:34 -0500 Subject: [PATCH 042/121] cpp code formatting --- python/cudf/udf_cpp/groupby/function.cu | 71 +++++++++++++------------ 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index f8d8594a4a2..2aa6119fbb9 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -259,7 +259,8 @@ __device__ void device_idxmin(T const* data, } template -__device__ T BlockSum(T const* data, int64_t size) { +__device__ T BlockSum(T const* data, int64_t size) +{ auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T sum; @@ -267,11 +268,11 @@ __device__ T BlockSum(T const* data, int64_t size) { __syncthreads(); device_sum(data, items_per_thread, size, &sum); return sum; - } template -__device__ T BlockMean(T const* data, int64_t size) { +__device__ T BlockMean(T const* data, int64_t size) +{ // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; @@ -284,9 +285,9 @@ __device__ T BlockMean(T const* data, int64_t size) { return mean; } - template -__device__ T BlockStd(T const* data, int64_t size) { +__device__ T BlockStd(T const* data, int64_t size) +{ // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T sum; @@ -301,7 +302,8 @@ __device__ T BlockStd(T const* data, int64_t size) { } template -__device__ T BlockVar(T const* data, int64_t size) { +__device__ T BlockVar(T const* data, int64_t size) +{ // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T sum; @@ -315,9 +317,9 @@ __device__ T BlockVar(T const* data, int64_t size) { return var; } - template -__device__ T BlockMax(T const* data, int64_t size) { +__device__ T BlockMax(T const* data, int64_t size) +{ // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smax; @@ -328,7 +330,8 @@ __device__ T BlockMax(T const* data, int64_t size) { } template -__device__ T BlockMin(T const* data, int64_t size) { +__device__ T BlockMin(T const* data, int64_t size) +{ // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smin; @@ -339,7 +342,8 @@ __device__ T BlockMin(T const* data, int64_t size) { } template -__device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) { +__device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) +{ // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smax; @@ -354,7 +358,8 @@ __device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) { } template -__device__ T BlockIdxMin(T const* data, int64_t* index, T min, int64_t size) { +__device__ T BlockIdxMin(T const* data, int64_t* index, T min, int64_t size) +{ auto const items_per_thread = (size + 
blockDim.x - 1) / blockDim.x; __shared__ T smin; __shared__ int64_t sidx; @@ -367,24 +372,30 @@ __device__ T BlockIdxMin(T const* data, int64_t* index, T min, int64_t size) { return sidx; } - -extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { +extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t size) +{ *numba_return_value = BlockSum(data, size); return 0; } -extern "C" __device__ int BlockSum_float64(double* numba_return_value, double const* data, int64_t size) { +extern "C" __device__ int BlockSum_float64(double* numba_return_value, + double const* data, + int64_t size) +{ *numba_return_value = BlockSum(data, size); return 0; } - -extern "C" __device__ int BlockMean_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) { +extern "C" __device__ int BlockMean_int64(int64_t* numba_return_value, + int64_t const* data, + int64_t size) +{ *numba_return_value = BlockMean(data, size); return 0; } - extern "C" __device__ int BlockMean_float64(double* numba_return_value, double const* data, int64_t size) @@ -393,24 +404,21 @@ extern "C" __device__ int BlockMean_float64(double* numba_return_value, return 0; } - extern "C" __device__ int BlockStd_int64(double* numba_return_value, int64_t const* data, int64_t size) { *numba_return_value = BlockStd(data, size); return 0; -} - +} -extern "C" __device__ int BlockStd_float64(double* numba_return_value, - double const* data, +extern "C" __device__ int BlockStd_float64(double* numba_return_value, + double const* data, int64_t size) { *numba_return_value = BlockStd(data, size); return 0; -} - +} extern "C" __device__ int BlockVar_int64(double* numba_return_value, int64_t const* data, @@ -420,16 +428,14 @@ extern "C" __device__ int BlockVar_int64(double* numba_return_value, return 0; } - extern "C" __device__ int BlockVar_float64(double* numba_return_value, double const* data, int64_t size) { -*numba_return_value = BlockVar(data, size); -return 0; + *numba_return_value = BlockVar(data, size); + return 0; } - extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) @@ -446,8 +452,6 @@ extern "C" __device__ int BlockMax_float64(double* numba_return_value, return 0; } - - extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, int64_t const* data, int64_t size) @@ -456,16 +460,14 @@ extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, return 0; } - extern "C" __device__ int BlockMin_float64(double* numba_return_value, - double const* data, - int64_t size) + double const* data, + int64_t size) { *numba_return_value = BlockMin(data, size); return 0; } - extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, int64_t const* data, int64_t* index, @@ -484,7 +486,6 @@ extern "C" __device__ int BlockIdxMax_float64(double* numba_return_value, return 0; } - extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, int64_t const* data, int64_t* index, From 5ae896ad8a425d4c64ace37af3f7b76d3afdd3ed Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 18 Jan 2023 14:49:50 -0800 Subject: [PATCH 043/121] style --- python/cudf/cudf/core/indexed_frame.py | 3 +-- python/cudf/cudf/core/udf/groupby_lowering.py | 2 +- python/cudf/cudf/core/udf/groupby_typing.py | 19 +++++++++++++------ python/cudf/cudf/core/udf/groupby_utils.py | 15 ++++++--------- python/cudf/cudf/core/udf/utils.py | 17 ++++++++--------- 
python/strings_udf/strings_udf/_typing.py | 6 ++---- 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 1af73044f67..d39dfccbe84 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,8 +24,6 @@ ) from uuid import uuid4 -from cudf.core.udf.utils import NoNumbaOccWarnings - import cupy as cp import numpy as np import pandas as pd @@ -61,6 +59,7 @@ from cudf.core.multiindex import MultiIndex from cudf.core.resample import _Resampler from cudf.core.udf.utils import ( + NoNumbaOccWarnings, _compile_or_get, _get_input_args_from_frame, _post_process_output_col, diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 606f8c6ebc8..faf4320caf1 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -13,7 +13,7 @@ Group, GroupType, call_cuda_functions, - index_default_type + index_default_type, ) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 890627a8c30..82cd863a42e 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -2,6 +2,7 @@ from typing import Any, Dict import numba +import pandas as pd from numba import cuda, types from numba.core.extending import ( make_attribute_wrapper, @@ -14,10 +15,10 @@ from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_registry from numba.np import numpy_support -import pandas as pd - -index_default_type = numpy_support.from_dtype(pd.RangeIndex(0,0).dtype) # int64 +index_default_type = numpy_support.from_dtype( + pd.RangeIndex(0, 0).dtype +) # int64 SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] SUPPORTED_GROUPBY_NUMPY_TYPES = [ numpy_support.as_dtype(dt) for dt in [types.int64, types.float64] @@ -31,6 +32,7 @@ class Group(object): serves as a handle for instantiating GroupType objects in python code and accessing their attributes """ + def __init__(self, group_data, size, index, dtype, index_dtype): self.group_data = group_data self.size = size @@ -43,8 +45,9 @@ class GroupType(numba.types.Type): """ Numba extension type carrying metadata associated with a single GroupBy group. This metadata ultimately is passed to the CUDA - __device__ function which actually performs the work. + __device__ function which actually performs the work. 
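    For example (illustrative only), ``GroupType(types.float64)`` describes
    a group whose values are float64 and whose index uses the default
    int64 index type (``index_default_type``).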
""" + def __init__(self, group_scalar_type, index_type=index_default_type): self.group_scalar_type = group_scalar_type self.index_type = index_type @@ -180,8 +183,12 @@ class GroupAttr(AttributeTemplate): resolve_sum = _create_reduction_attr("GroupType.sum") resolve_size = _create_reduction_attr("GroupType.size", retty=types.int64) - resolve_count = _create_reduction_attr("GroupType.count", retty=types.int64) - resolve_mean = _create_reduction_attr("GroupType.mean", retty=types.float64) + resolve_count = _create_reduction_attr( + "GroupType.count", retty=types.int64 + ) + resolve_mean = _create_reduction_attr( + "GroupType.mean", retty=types.float64 + ) resolve_var = _create_reduction_attr("GroupType.var", retty=types.float64) resolve_std = _create_reduction_attr("GroupType.std", retty=types.float64) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 3f8f787c01b..6201c922233 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -5,14 +5,12 @@ import cupy as cp import numpy as np -import numba from numba import cuda, types from numba.cuda.cudadrv.devices import get_context from numba.np import numpy_support from numba.types import Record import cudf.core.udf.utils -from cudf.core.udf.utils import NoNumbaOccWarnings from cudf.core.udf.groupby_typing import ( SUPPORTED_GROUPBY_NUMPY_TYPES, Group, @@ -23,16 +21,16 @@ groupby_apply_kernel_template, ) from cudf.core.udf.utils import ( + NoNumbaOccWarnings, + _get_extensionty_size, _get_kernel, _get_ptx_file, _get_udf_return_type, _supported_cols_from_frame, _supported_dtypes_from_frame, - _get_extensionty_size ) from cudf.utils.utils import _cudf_nvtx_annotate - dev_func_ptx = _get_ptx_file(os.path.dirname(__file__), "function_") cudf.core.udf.utils.ptx_files.append(dev_func_ptx) @@ -138,13 +136,13 @@ def _get_groupby_apply_kernel(frame, func, args): @_cudf_nvtx_annotate def jit_groupby_apply(offsets, grouped_values, function, *args): """ - Main entrypoint for JIT Groupby.apply via Numba. + Main entrypoint for JIT Groupby.apply via Numba. 
Parameters ---------- offsets : list - A list of intergers denoting the indices of the group - boundries in grouped_values + A list of integers denoting the indices of the group + boundaries in grouped_values grouped_values : DataFrame A DataFrame representing the source data sorted by group keys @@ -188,7 +186,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): ctx = get_context() # Dispatcher is specialized, so there's only one definition - get # it so we can get the cufunc from the code library - kern_def, = specialized.overloads.values() + (kern_def,) = specialized.overloads.values() grid, tpb = ctx.get_max_potential_block_size( func=kern_def._codelibrary.get_cufunc(), b2d_func=0, @@ -201,5 +199,4 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): with NoNumbaOccWarnings(): specialized[ngroups, tpb](*launch_args) - return output diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 519607cc3a1..9fa006e8e47 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -4,18 +4,15 @@ import os from typing import Any, Callable, Dict, List -import numba - -from numba.core.datamodel import default_manager -from numba.cuda.cudadrv import nvvm - -import llvmlite.binding as ll - import cachetools import cupy as cp +import llvmlite.binding as ll +import numba import numpy as np from numba import cuda, typeof +from numba.core.datamodel import default_manager from numba.core.errors import TypingError +from numba.cuda.cudadrv import nvvm from numba.np import numpy_support from numba.types import CPointer, Poison, Tuple, boolean, int64, void @@ -329,12 +326,15 @@ def _get_ptx_file(path, prefix): else: return regular_result[1] + class NoNumbaOccWarnings(object): def __enter__(self): numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + def __exit__(self, exc_type, exc_val, exc_tb): numba.config.CUDA_LOW_OCCUPANCY_WARNINGS = 1 + def _get_extensionty_size(ty): """ Return the size of an extension type in bytes @@ -343,6 +343,5 @@ def _get_extensionty_size(ty): if isinstance(data_layout, dict): data_layout = data_layout[64] target_data = ll.create_target_data(data_layout) - llty = default_manager[ty].get_value_type() + llty = default_manager[ty].get_value_type() return llty.get_abi_size(target_data) - diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 7749dd242d9..604a4c26714 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -1,8 +1,7 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
import operator -import llvmlite.binding as ll import numpy as np from numba import types from numba.core.extending import models, register_model @@ -12,11 +11,10 @@ from cudf.core.udf.utils import _get_extensionty_size - - # libcudf size_type size_type = types.int32 + # String object definitions class UDFString(types.Type): From a42d3077e90e042201efa5776ba2a4129579b2f6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Jan 2023 18:03:56 -0500 Subject: [PATCH 044/121] Use cudf size_type --- python/cudf/udf_cpp/groupby/CMakeLists.txt | 2 +- python/cudf/udf_cpp/groupby/function.cu | 28 +++++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt index 2fd45e59885..fcf036a0812 100644 --- a/python/cudf/udf_cpp/groupby/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -81,7 +81,7 @@ foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) target_compile_options( ${tgt} PRIVATE "$<$:${GROUPBY_FUNCTION_CUDA_FLAGS}>" ) - target_link_libraries(${tgt} PUBLIC CUDA::nvrtc) + target_link_libraries(${tgt} PUBLIC cudf::cudf) copy_ptx_to_location(${tgt} "${CMAKE_CURRENT_BINARY_DIR}/") install( diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 2aa6119fbb9..94c3b243e66 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -14,11 +14,11 @@ * limitations under the License. */ +#include + #include #include -using size_type = int; - // double atomicAdd __device__ __forceinline__ double atomicAdds(double* address, double val) { @@ -81,13 +81,13 @@ __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) // Use a C++ templated __device__ function to implement the body of the algorithm. template -__device__ void device_sum(T const* data, int const items_per_thread, size_type size, T* sum) +__device__ void device_sum(T const* data, int const items_per_thread, cudf::size_type size, T* sum) { T local_sum = 0; // Calculate local sum for each thread #pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { + for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { T load = data[threadIdx.x + item * blockDim.x]; local_sum += load; @@ -102,7 +102,7 @@ __device__ void device_sum(T const* data, int const items_per_thread, size_type // Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_var( - T const* data, int const items_per_thread, size_type size, T* sum, double* var) + T const* data, int const items_per_thread, cudf::size_type size, T* sum, double* var) { // Calculate how many elements each thread is working on T local_sum = 0; @@ -117,7 +117,7 @@ __device__ void device_var( // Calculate local sum for each thread #pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { + for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { T load = data[threadIdx.x + item * blockDim.x]; double temp = load - mean; @@ -138,13 +138,13 @@ __device__ void device_var( // Use a C++ templated __device__ function to implement the body of the algorithm. 
template __device__ void device_max( - T const* data, int const items_per_thread, size_type size, T init_val, T* smax) + T const* data, int const items_per_thread, cudf::size_type size, T init_val, T* smax) { T local_max = init_val; // Calculate local max for each thread #pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { + for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { T load = data[threadIdx.x + item * blockDim.x]; local_max = max(local_max, load); @@ -162,13 +162,13 @@ __device__ void device_max( // Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_min( - T const* data, int const items_per_thread, size_type size, T init_val, T* smin) + T const* data, int const items_per_thread, cudf::size_type size, T init_val, T* smin) { T local_min = init_val; // Calculate local min for each thread #pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { + for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { T load = data[threadIdx.x + item * blockDim.x]; local_min = min(local_min, load); @@ -188,7 +188,7 @@ template __device__ void device_idxmax(T const* data, int const items_per_thread, int64_t const* index, - size_type size, + cudf::size_type size, T init_val, T* smax, int64_t* sidx) @@ -199,7 +199,7 @@ __device__ void device_idxmax(T const* data, // Calculate local max for each thread #pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { + for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { T load = data[threadIdx.x + item * blockDim.x]; if (load > local_max) { @@ -226,7 +226,7 @@ template __device__ void device_idxmin(T const* data, int const items_per_thread, int64_t const* index, - size_type size, + cudf::size_type size, T init_val, T* smin, int64_t* sidx) @@ -236,7 +236,7 @@ __device__ void device_idxmin(T const* data, // Calculate local max for each thread #pragma unroll - for (size_type item = 0; item < items_per_thread; item++) { + for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { T load = data[threadIdx.x + item * blockDim.x]; if (load < local_min) { From 0e0b750e14a71c73ac014210749e9bdae55cb214 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Jan 2023 18:18:11 -0500 Subject: [PATCH 045/121] Use std limits instead of macros --- python/cudf/udf_cpp/groupby/function.cu | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 94c3b243e66..50a3e6eb4da 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -16,10 +16,8 @@ #include -#include -#include +#include -// double atomicAdd __device__ __forceinline__ double atomicAdds(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -323,9 +321,9 @@ __device__ T BlockMax(T const* data, int64_t size) // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smax; - if (threadIdx.x == 0) { smax = INT64_MIN; } + if (threadIdx.x == 0) { smax = std::numeric_limits::min(); } __syncthreads(); - device_max(data, items_per_thread, size, INT64_MIN, &smax); + 
device_max(data, items_per_thread, size, std::numeric_limits::min(), &smax); return smax; } @@ -335,9 +333,9 @@ __device__ T BlockMin(T const* data, int64_t size) // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smin; - if (threadIdx.x == 0) { smin = INT64_MAX; } + if (threadIdx.x == 0) { smin = std::numeric_limits::max(); } __syncthreads(); - device_min(data, items_per_thread, size, INT64_MAX, &smin); + device_min(data, items_per_thread, size, std::numeric_limits::max(), &smin); return smin; } @@ -349,11 +347,12 @@ __device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) __shared__ T smax; __shared__ int64_t sidx; if (threadIdx.x == 0) { - smax = INT64_MIN; - sidx = INT64_MAX; + smax = std::numeric_limits::min(); + sidx = std::numeric_limits::max(); } __syncthreads(); - device_idxmax(data, items_per_thread, index, size, INT64_MIN, &smax, &sidx); + device_idxmax( + data, items_per_thread, index, size, std::numeric_limits::min(), &smax, &sidx); return sidx; } @@ -365,7 +364,7 @@ __device__ T BlockIdxMin(T const* data, int64_t* index, T min, int64_t size) __shared__ int64_t sidx; if (threadIdx.x == 0) { smin = min; - sidx = INT64_MAX; + sidx = std::numeric_limits::max(); } __syncthreads(); device_idxmin(data, items_per_thread, index, size, min, &smin, &sidx); @@ -491,7 +490,8 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, int64_t* index, int64_t size) { - *numba_return_value = BlockIdxMin(data, index, INT64_MAX, size); + *numba_return_value = + BlockIdxMin(data, index, std::numeric_limits::max(), size); return 0; } @@ -500,6 +500,6 @@ extern "C" __device__ int BlockIdxMin_float64(double* numba_return_value, int64_t* index, int64_t size) { - *numba_return_value = BlockIdxMin(data, index, DBL_MAX, size); + *numba_return_value = BlockIdxMin(data, index, std::numeric_limits::max(), size); return 0; } From 95fa402f5ddf1336ce72ed984d472f818f4c32ab Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 19 Jan 2023 11:18:08 -0800 Subject: [PATCH 046/121] remove redundant comments --- python/cudf/udf_cpp/groupby/function.cu | 29 ------------------------- 1 file changed, 29 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 50a3e6eb4da..29c49181546 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -33,13 +33,11 @@ __device__ __forceinline__ double atomicAdds(double* address, double val) return __longlong_as_double(old); } -// int64_t atomicAdd __device__ __forceinline__ int64_t atomicAdds(int64_t* address, int64_t val) { return atomicAdd((unsigned long long*)address, (unsigned long long)val); } -// double atomicMax __device__ __forceinline__ double atomicMax(double* address, double val) { unsigned long long old = __double_as_longlong(*address); @@ -52,13 +50,11 @@ __device__ __forceinline__ double atomicMax(double* address, double val) return __longlong_as_double(old); } -// int64_t atomicMax __device__ __forceinline__ int64_t atomicMax(int64_t* address, int64_t val) { return atomicMax((long long*)address, (long long)val); } -// double atomicMin __device__ __forceinline__ double atomicMin(double* address, double val) { unsigned long long old = __double_as_longlong(*address); @@ -71,19 +67,16 @@ __device__ __forceinline__ double atomicMin(double* address, double val) return __longlong_as_double(old); } -// int64_t atomicMin __device__ __forceinline__ 
int64_t atomicMin(int64_t* address, int64_t val) { return atomicMin((long long*)address, (long long)val); } -// Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_sum(T const* data, int const items_per_thread, cudf::size_type size, T* sum) { T local_sum = 0; -// Calculate local sum for each thread #pragma unroll for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { @@ -97,12 +90,10 @@ __device__ void device_sum(T const* data, int const items_per_thread, cudf::size __syncthreads(); } -// Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_var( T const* data, int const items_per_thread, cudf::size_type size, T* sum, double* var) { - // Calculate how many elements each thread is working on T local_sum = 0; double local_var = 0; double mean; @@ -113,7 +104,6 @@ __device__ void device_var( mean = (*sum) / static_cast(size); -// Calculate local sum for each thread #pragma unroll for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { @@ -133,14 +123,12 @@ __device__ void device_var( __syncthreads(); } -// Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_max( T const* data, int const items_per_thread, cudf::size_type size, T init_val, T* smax) { T local_max = init_val; -// Calculate local max for each thread #pragma unroll for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { @@ -151,20 +139,17 @@ __device__ void device_max( __syncthreads(); - // Calculate local max for each group atomicMax(smax, local_max); __syncthreads(); } -// Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_min( T const* data, int const items_per_thread, cudf::size_type size, T init_val, T* smin) { T local_min = init_val; -// Calculate local min for each thread #pragma unroll for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { @@ -175,13 +160,11 @@ __device__ void device_min( __syncthreads(); - // Calculate local min for each group atomicMin(smin, local_min); __syncthreads(); } -// Use a C++ templated __device__ function to implement the body of the algorithm. template __device__ void device_idxmax(T const* data, int const items_per_thread, @@ -191,11 +174,9 @@ __device__ void device_idxmax(T const* data, T* smax, int64_t* sidx) { - // Calculate how many elements each thread is working on T local_max = init_val; int64_t local_idx = -1; -// Calculate local max for each thread #pragma unroll for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { @@ -209,7 +190,6 @@ __device__ void device_idxmax(T const* data, __syncthreads(); - // Calculate local max for each group atomicMax(smax, local_max); __syncthreads(); @@ -219,7 +199,6 @@ __device__ void device_idxmax(T const* data, __syncthreads(); } -// Use a C++ templated __device__ function to implement the body of the algorithm. 
template __device__ void device_idxmin(T const* data, int const items_per_thread, @@ -232,7 +211,6 @@ __device__ void device_idxmin(T const* data, T local_min = init_val; int64_t local_idx = -1; -// Calculate local max for each thread #pragma unroll for (cudf::size_type item = 0; item < items_per_thread; item++) { if (threadIdx.x + (item * blockDim.x) < size) { @@ -246,7 +224,6 @@ __device__ void device_idxmin(T const* data, __syncthreads(); - // Calculate local max for each group atomicMin(smin, local_min); __syncthreads(); @@ -271,7 +248,6 @@ __device__ T BlockSum(T const* data, int64_t size) template __device__ T BlockMean(T const* data, int64_t size) { - // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T sum; @@ -286,7 +262,6 @@ __device__ T BlockMean(T const* data, int64_t size) template __device__ T BlockStd(T const* data, int64_t size) { - // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T sum; __shared__ double var; @@ -302,7 +277,6 @@ __device__ T BlockStd(T const* data, int64_t size) template __device__ T BlockVar(T const* data, int64_t size) { - // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T sum; __shared__ double var; @@ -318,7 +292,6 @@ __device__ T BlockVar(T const* data, int64_t size) template __device__ T BlockMax(T const* data, int64_t size) { - // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smax; if (threadIdx.x == 0) { smax = std::numeric_limits::min(); } @@ -330,7 +303,6 @@ __device__ T BlockMax(T const* data, int64_t size) template __device__ T BlockMin(T const* data, int64_t size) { - // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smin; if (threadIdx.x == 0) { smin = std::numeric_limits::max(); } @@ -342,7 +314,6 @@ __device__ T BlockMin(T const* data, int64_t size) template __device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) { - // Calculate how many elements each thread is working on auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; __shared__ T smax; __shared__ int64_t sidx; From 3de3add6b16830da52084cf3a227ff99400f5449 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 19 Jan 2023 11:19:40 -0800 Subject: [PATCH 047/121] style fixes --- python/cudf/cudf/core/udf/utils.py | 2 +- python/strings_udf/strings_udf/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 594c251f9f5..d7c9a4f3d5c 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -7,7 +7,6 @@ import cachetools import cupy as cp import llvmlite.binding as ll -import numba import numpy as np from numba import cuda, typeof from numba.core.datamodel import default_manager @@ -326,6 +325,7 @@ def _get_ptx_file(path, prefix): else: return regular_result[1] + def _get_extensionty_size(ty): """ Return the size of an extension type in bytes diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index 5cfebedff6a..b7c4f58a8d1 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -62,6 +62,7 @@ def 
_get_cuda_version_from_ptx_file(path): return cuda_ver + path = os.path.dirname(__file__) From 865bb5d1445968e1ff5ac6138dbbc858e51d9fe1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 19 Jan 2023 16:05:59 -0800 Subject: [PATCH 048/121] debug statements --- python/strings_udf/strings_udf/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index b7c4f58a8d1..a5e593e1bc1 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -67,11 +67,13 @@ def _get_cuda_version_from_ptx_file(path): def maybe_patch_numba_linker(driver_version, ptx_toolkit_version): + print(f"driver version is {driver_version}") + print(f"ptx toolkit version is {ptx_toolkit_version}") # Numba thinks cubinlinker is only needed if the driver is older than the ctk # but when strings_udf is present, it might also need to patch because the PTX # file strings_udf relies on may be newer than the driver as well if driver_version < ptx_toolkit_version: - logger.debug( + print( "Driver version %s.%s needs patching due to strings_udf" % driver_version ) @@ -80,6 +82,8 @@ def maybe_patch_numba_linker(driver_version, ptx_toolkit_version): Linker.new = new_patched_linker else: logger.debug("Cannot patch Numba Linker - unsupported version") + else: + print("not patching numba linker") # Maximum size of a string column is 2 GiB From 778894466b9ed7512f18363eb03597267ca916db Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 20 Jan 2023 06:52:03 -0800 Subject: [PATCH 049/121] patch numba linker based off groupby ptx file --- python/cudf/cudf/__init__.py | 11 ++- python/cudf/cudf/core/udf/utils.py | 85 ++++++++++++++++++++++ python/cudf/cudf/utils/gpu_utils.py | 2 +- python/strings_udf/strings_udf/__init__.py | 78 ++------------------ 4 files changed, 100 insertions(+), 76 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 28eb380f7cb..a797d82530b 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,8 +1,9 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from cudf.utils.gpu_utils import validate_setup validate_setup() +import os import cupy from numba import config as numba_config, cuda @@ -88,7 +89,13 @@ pass else: # Patch Numba to support CUDA enhanced compatibility. 
- patch_numba_linker_if_needed() + # cuDF requires a stronger set of conditions than what is + # checked by patch_numba_linker_if_needed due to the PTX + # files needed for JIT Groupby Apply and string UDFs + from cudf.core.udf.utils import _setup_numba_linker + + _setup_numba_linker(os.path.dirname(__file__) + "/core/udf/", "function_") + del patch_numba_linker_if_needed cuda.set_memory_manager(rmm.RMMNumbaManager) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index d7c9a4f3d5c..217dd0aaa97 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -8,10 +8,12 @@ import cupy as cp import llvmlite.binding as ll import numpy as np +from cubinlinker.patch import _numba_version_ok, get_logger, new_patched_linker from numba import cuda, typeof from numba.core.datamodel import default_manager from numba.core.errors import TypingError from numba.cuda.cudadrv import nvvm +from numba.cuda.cudadrv.driver import Linker from numba.np import numpy_support from numba.types import CPointer, Poison, Tuple, boolean, int64, void @@ -29,6 +31,9 @@ ) from cudf.utils.utils import _cudf_nvtx_annotate +logger = get_logger() + + JIT_SUPPORTED_TYPES = ( NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES ) @@ -336,3 +341,83 @@ def _get_extensionty_size(ty): target_data = ll.create_target_data(data_layout) llty = default_manager[ty].get_value_type() return llty.get_abi_size(target_data) + + +def _get_cuda_version_from_ptx_file(path): + """ + https://docs.nvidia.com/cuda/parallel-thread-execution/ + Each PTX module must begin with a .version + directive specifying the PTX language version + + example header: + // + // Generated by NVIDIA NVVM Compiler + // + // Compiler Build ID: CL-31057947 + // Cuda compilation tools, release 11.6, V11.6.124 + // Based on NVVM 7.0.1 + // + + .version 7.6 + .target sm_52 + .address_size 64 + + """ + with open(path) as ptx_file: + for line in ptx_file: + if line.startswith(".version"): + ver_line = line + break + else: + raise ValueError("Could not read CUDA version from ptx file.") + version = ver_line.strip("\n").split(" ")[1] + # from ptx_docs/release_notes above: + ver_map = { + "7.5": (11, 5), + "7.6": (11, 6), + "7.7": (11, 7), + "7.8": (11, 8), + "8.0": (12, 0), + } + + cuda_ver = ver_map.get(version) + if cuda_ver is None: + raise ValueError( + f"Could not map PTX version {version} to a CUDA version" + ) + + return cuda_ver + + +def _setup_numba_linker(path, prefix): + from ptxcompiler.patch import NO_DRIVER, safe_get_versions + + from cudf.core.udf.utils import ( + _get_cuda_version_from_ptx_file, + _get_ptx_file, + maybe_patch_numba_linker, + ) + + versions = safe_get_versions() + if versions != NO_DRIVER: + driver_version, runtime_version = versions + ptxpath = _get_ptx_file(path, prefix) + strings_udf_ptx_version = _get_cuda_version_from_ptx_file(ptxpath) + maybe_patch_numba_linker(driver_version, strings_udf_ptx_version) + + +def maybe_patch_numba_linker(driver_version, ptx_toolkit_version): + # Numba thinks cubinlinker is only needed if the driver is older than + # the ctk but when strings_udf is present, it might also need to patch + # because the PTX file strings_udf relies on may be newer than + # the driver as well + if driver_version < ptx_toolkit_version: + print( + "Driver version %s.%s needs patching due to strings_udf" + % driver_version + ) + if _numba_version_ok: + logger.debug("Patching Numba Linker") + Linker.new = new_patched_linker + else: + logger.debug("Cannot patch 
Numba Linker - unsupported version") diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index ab3adc1651a..c10dd8ffb3e 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. def validate_setup(): diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index a5e593e1bc1..247a22fe1f9 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -1,91 +1,23 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. -import glob import os -from cubinlinker.patch import _numba_version_ok, get_logger, new_patched_linker from cuda import cudart -from numba import cuda -from numba.cuda.cudadrv.driver import Linker from ptxcompiler.patch import NO_DRIVER, safe_get_versions -from cudf.core.udf.utils import _get_ptx_file +from cudf.core.udf.utils import ( + _get_cuda_version_from_ptx_file, + _get_ptx_file, + maybe_patch_numba_linker, +) from . import _version __version__ = _version.get_versions()["version"] -logger = get_logger() - - -def _get_cuda_version_from_ptx_file(path): - """ - https://docs.nvidia.com/cuda/parallel-thread-execution/ - Each PTX module must begin with a .version - directive specifying the PTX language version - - example header: - // - // Generated by NVIDIA NVVM Compiler - // - // Compiler Build ID: CL-31057947 - // Cuda compilation tools, release 11.6, V11.6.124 - // Based on NVVM 7.0.1 - // - - .version 7.6 - .target sm_52 - .address_size 64 - - """ - with open(path) as ptx_file: - for line in ptx_file: - if line.startswith(".version"): - ver_line = line - break - else: - raise ValueError("Could not read CUDA version from ptx file.") - version = ver_line.strip("\n").split(" ")[1] - # from ptx_docs/release_notes above: - ver_map = { - "7.5": (11, 5), - "7.6": (11, 6), - "7.7": (11, 7), - "7.8": (11, 8), - "8.0": (12, 0), - } - - cuda_ver = ver_map.get(version) - if cuda_ver is None: - raise ValueError( - f"Could not map PTX version {version} to a CUDA version" - ) - - return cuda_ver - path = os.path.dirname(__file__) -def maybe_patch_numba_linker(driver_version, ptx_toolkit_version): - print(f"driver version is {driver_version}") - print(f"ptx toolkit version is {ptx_toolkit_version}") - # Numba thinks cubinlinker is only needed if the driver is older than the ctk - # but when strings_udf is present, it might also need to patch because the PTX - # file strings_udf relies on may be newer than the driver as well - if driver_version < ptx_toolkit_version: - print( - "Driver version %s.%s needs patching due to strings_udf" - % driver_version - ) - if _numba_version_ok: - logger.debug("Patching Numba Linker") - Linker.new = new_patched_linker - else: - logger.debug("Cannot patch Numba Linker - unsupported version") - else: - print("not patching numba linker") - - # Maximum size of a string column is 2 GiB _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( "STRINGS_UDF_HEAP_SIZE", 2**31 From bdea84c2a9410c4079a3ab0f991e648124c6249c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 09:56:20 -0500 Subject: [PATCH 050/121] Fix idxmin/max bug --- python/cudf/udf_cpp/groupby/function.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 29c49181546..fadd8a4eeb6 100644 --- 
a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -447,7 +447,7 @@ extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, return 0; } -extern "C" __device__ int BlockIdxMax_float64(double* numba_return_value, +extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, double const* data, int64_t* index, int64_t size) @@ -466,7 +466,7 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, return 0; } -extern "C" __device__ int BlockIdxMin_float64(double* numba_return_value, +extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, double const* data, int64_t* index, int64_t size) From 0110075130e83e0dbe03e45ce5971f17b1d12210 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 10:29:17 -0500 Subject: [PATCH 051/121] Use static_cast to avoid raw casting --- python/cudf/udf_cpp/groupby/function.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index fadd8a4eeb6..5dbac8ebf35 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -20,7 +20,7 @@ __device__ __forceinline__ double atomicAdds(double* address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int* address_as_ull = static_cast(address); unsigned long long int old = *address_as_ull, assumed; do { @@ -35,7 +35,7 @@ __device__ __forceinline__ double atomicAdds(double* address, double val) __device__ __forceinline__ int64_t atomicAdds(int64_t* address, int64_t val) { - return atomicAdd((unsigned long long*)address, (unsigned long long)val); + return atomicAdd(static_cast(address), static_cast(val)); } __device__ __forceinline__ double atomicMax(double* address, double val) @@ -43,7 +43,8 @@ __device__ __forceinline__ double atomicMax(double* address, double val) unsigned long long old = __double_as_longlong(*address); while (val > __longlong_as_double(old)) { unsigned long long assumed = old; - if ((old = atomicCAS((unsigned long long*)address, assumed, __double_as_longlong(val))) == + if ((old = atomicCAS( + static_cast(address), assumed, __double_as_longlong(val))) == assumed) break; } @@ -52,7 +53,7 @@ __device__ __forceinline__ double atomicMax(double* address, double val) __device__ __forceinline__ int64_t atomicMax(int64_t* address, int64_t val) { - return atomicMax((long long*)address, (long long)val); + return atomicMax(static_cast(address), static_cast(val)); } __device__ __forceinline__ double atomicMin(double* address, double val) @@ -60,7 +61,8 @@ __device__ __forceinline__ double atomicMin(double* address, double val) unsigned long long old = __double_as_longlong(*address); while (val < __longlong_as_double(old)) { unsigned long long assumed = old; - if ((old = atomicCAS((unsigned long long*)address, assumed, __double_as_longlong(val))) == + if ((old = atomicCAS( + static_cast(address), assumed, __double_as_longlong(val))) == assumed) break; } @@ -69,7 +71,7 @@ __device__ __forceinline__ double atomicMin(double* address, double val) __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) { - return atomicMin((long long*)address, (long long)val); + return atomicMin(static_cast(address), static_cast(val)); } template From b039ce7b2770c0d3ff50c72a5b25f6352debaf38 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 10:38:16 -0500 Subject: [PATCH 
052/121] Cleanups: reinterpret_cast + remove redundant sync --- python/cudf/udf_cpp/groupby/function.cu | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 5dbac8ebf35..fb58d78cd67 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -20,7 +20,7 @@ __device__ __forceinline__ double atomicAdds(double* address, double val) { - unsigned long long int* address_as_ull = static_cast(address); + unsigned long long int* address_as_ull = reinterpret_cast(address); unsigned long long int old = *address_as_ull, assumed; do { @@ -35,7 +35,8 @@ __device__ __forceinline__ double atomicAdds(double* address, double val) __device__ __forceinline__ int64_t atomicAdds(int64_t* address, int64_t val) { - return atomicAdd(static_cast(address), static_cast(val)); + return atomicAdd(reinterpret_cast(address), + static_cast(val)); } __device__ __forceinline__ double atomicMax(double* address, double val) @@ -44,7 +45,7 @@ __device__ __forceinline__ double atomicMax(double* address, double val) while (val > __longlong_as_double(old)) { unsigned long long assumed = old; if ((old = atomicCAS( - static_cast(address), assumed, __double_as_longlong(val))) == + reinterpret_cast(address), assumed, __double_as_longlong(val))) == assumed) break; } @@ -53,7 +54,7 @@ __device__ __forceinline__ double atomicMax(double* address, double val) __device__ __forceinline__ int64_t atomicMax(int64_t* address, int64_t val) { - return atomicMax(static_cast(address), static_cast(val)); + return atomicMax(reinterpret_cast(address), static_cast(val)); } __device__ __forceinline__ double atomicMin(double* address, double val) @@ -62,7 +63,7 @@ __device__ __forceinline__ double atomicMin(double* address, double val) while (val < __longlong_as_double(old)) { unsigned long long assumed = old; if ((old = atomicCAS( - static_cast(address), assumed, __double_as_longlong(val))) == + reinterpret_cast(address), assumed, __double_as_longlong(val))) == assumed) break; } @@ -71,7 +72,7 @@ __device__ __forceinline__ double atomicMin(double* address, double val) __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) { - return atomicMin(static_cast(address), static_cast(val)); + return atomicMin(reinterpret_cast(address), static_cast(val)); } template @@ -102,8 +103,6 @@ __device__ void device_var( device_sum(data, items_per_thread, size, sum); - __syncthreads(); - mean = (*sum) / static_cast(size); #pragma unroll @@ -139,8 +138,6 @@ __device__ void device_max( } } - __syncthreads(); - atomicMax(smax, local_max); __syncthreads(); @@ -160,8 +157,6 @@ __device__ void device_min( } } - __syncthreads(); - atomicMin(smin, local_min); __syncthreads(); @@ -190,8 +185,6 @@ __device__ void device_idxmax(T const* data, } } - __syncthreads(); - atomicMax(smax, local_max); __syncthreads(); @@ -224,8 +217,6 @@ __device__ void device_idxmin(T const* data, } } - __syncthreads(); - atomicMin(smin, local_min); __syncthreads(); From 14dc67408f12c36cc878b56a1883c8c48ec99c40 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 12:11:33 -0500 Subject: [PATCH 053/121] Simplify block min/max logic --- python/cudf/udf_cpp/groupby/function.cu | 74 +++++++++---------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index fb58d78cd67..c288c3b6d82 100644 --- 
a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -124,44 +124,6 @@ __device__ void device_var( __syncthreads(); } -template -__device__ void device_max( - T const* data, int const items_per_thread, cudf::size_type size, T init_val, T* smax) -{ - T local_max = init_val; - -#pragma unroll - for (cudf::size_type item = 0; item < items_per_thread; item++) { - if (threadIdx.x + (item * blockDim.x) < size) { - T load = data[threadIdx.x + item * blockDim.x]; - local_max = max(local_max, load); - } - } - - atomicMax(smax, local_max); - - __syncthreads(); -} - -template -__device__ void device_min( - T const* data, int const items_per_thread, cudf::size_type size, T init_val, T* smin) -{ - T local_min = init_val; - -#pragma unroll - for (cudf::size_type item = 0; item < items_per_thread; item++) { - if (threadIdx.x + (item * blockDim.x) < size) { - T load = data[threadIdx.x + item * blockDim.x]; - local_min = min(local_min, load); - } - } - - atomicMin(smin, local_min); - - __syncthreads(); -} - template __device__ void device_idxmax(T const* data, int const items_per_thread, @@ -285,23 +247,39 @@ __device__ T BlockVar(T const* data, int64_t size) template __device__ T BlockMax(T const* data, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T smax; - if (threadIdx.x == 0) { smax = std::numeric_limits::min(); } + T local_max = std::numeric_limits::min(); + __shared__ T block_max; + if (threadIdx.x == 0) { block_max = local_max; } + __syncthreads(); + +#pragma unroll + for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + local_max = max(local_max, data[idx]); + } + + atomicMax(&block_max, local_max); __syncthreads(); - device_max(data, items_per_thread, size, std::numeric_limits::min(), &smax); - return smax; + + return block_max; } template __device__ T BlockMin(T const* data, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T smin; - if (threadIdx.x == 0) { smin = std::numeric_limits::max(); } + T local_min = std::numeric_limits::max(); + __shared__ T block_min; + if (threadIdx.x == 0) { block_min == local_min; } __syncthreads(); - device_min(data, items_per_thread, size, std::numeric_limits::max(), &smin); - return smin; + +#pragma unroll + for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + local_min = min(local_min, data[idx]); + } + + atomicMin(&block_min, local_min); + __syncthreads(); + + return block_min; } template From 99af3f23caf718d3e6440a04e395164e3c2935a9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 12:41:31 -0500 Subject: [PATCH 054/121] Replace custom atomic add with cuda atomic_ref --- python/cudf/udf_cpp/groupby/function.cu | 29 +++++-------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index c288c3b6d82..4f483e84cba 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -16,28 +16,9 @@ #include -#include - -__device__ __forceinline__ double atomicAdds(double* address, double val) -{ - unsigned long long int* address_as_ull = reinterpret_cast(address); - unsigned long long int old = *address_as_ull, assumed; +#include - do { - assumed = old; - old = - atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); - - } while (assumed != old); - - return __longlong_as_double(old); -} - -__device__ 
__forceinline__ int64_t atomicAdds(int64_t* address, int64_t val) -{ - return atomicAdd(reinterpret_cast(address), - static_cast(val)); -} +#include __device__ __forceinline__ double atomicMax(double* address, double val) { @@ -88,7 +69,8 @@ __device__ void device_sum(T const* data, int const items_per_thread, cudf::size } } - atomicAdds(sum, local_sum); + cuda::atomic_ref ref{*sum}; + ref.fetch_add(local_sum, cuda::std::memory_order_relaxed); __syncthreads(); } @@ -115,7 +97,8 @@ __device__ void device_var( } } - atomicAdds(var, local_var); + cuda::atomic_ref ref{*var}; + ref.fetch_add(local_var, cuda::std::memory_order_relaxed); __syncthreads(); From 321fdaba528c89aef521e3ea1d889082200b4ad1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 12:48:41 -0500 Subject: [PATCH 055/121] Simplify block sum logic --- python/cudf/udf_cpp/groupby/function.cu | 35 ++++++++++--------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 4f483e84cba..40d40de2ee2 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -57,16 +57,13 @@ __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) } template -__device__ void device_sum(T const* data, int const items_per_thread, cudf::size_type size, T* sum) +__device__ void device_sum(T const* data, int64_t size, T* sum) { T local_sum = 0; #pragma unroll - for (cudf::size_type item = 0; item < items_per_thread; item++) { - if (threadIdx.x + (item * blockDim.x) < size) { - T load = data[threadIdx.x + item * blockDim.x]; - local_sum += load; - } + for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + local_sum += data[idx]; } cuda::atomic_ref ref{*sum}; @@ -83,7 +80,7 @@ __device__ void device_var( double local_var = 0; double mean; - device_sum(data, items_per_thread, size, sum); + device_sum(data, size, sum); mean = (*sum) / static_cast(size); @@ -174,27 +171,23 @@ __device__ void device_idxmin(T const* data, template __device__ T BlockSum(T const* data, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T sum; - - if (threadIdx.x == 0) { sum = 0; } + __shared__ T block_sum; + if (threadIdx.x == 0) { block_sum = 0; } __syncthreads(); - device_sum(data, items_per_thread, size, &sum); - return sum; + + device_sum(data, size, &block_sum); + return block_sum; } template __device__ T BlockMean(T const* data, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - - __shared__ T sum; - if (threadIdx.x == 0) { sum = 0; } - + __shared__ T block_sum; + if (threadIdx.x == 0) { block_sum = 0; } __syncthreads(); - device_sum(data, items_per_thread, size, &sum); - double mean = sum / static_cast(size); - return mean; + + device_sum(data, size, &block_sum); + return block_sum / static_cast(size); } template From c91a58919993f92aab88e101423f964183b0e442 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 13:04:20 -0500 Subject: [PATCH 056/121] Simplify block var logic --- python/cudf/udf_cpp/groupby/function.cu | 53 ++++++++++--------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 40d40de2ee2..584bc553b77 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -73,25 +73,24 @@ __device__ void device_sum(T const* data, int64_t size, 
T* sum) } template -__device__ void device_var( - T const* data, int const items_per_thread, cudf::size_type size, T* sum, double* var) +__device__ void device_var(T const* data, int64_t size, double* var) { T local_sum = 0; double local_var = 0; - double mean; - device_sum(data, size, sum); + __shared__ T block_sum; + if (threadIdx.x == 0) { block_sum = 0; } + __syncthreads(); + + device_sum(data, size, &block_sum); - mean = (*sum) / static_cast(size); + auto const mean = static_cast(block_sum) / static_cast(size); #pragma unroll - for (cudf::size_type item = 0; item < items_per_thread; item++) { - if (threadIdx.x + (item * blockDim.x) < size) { - T load = data[threadIdx.x + item * blockDim.x]; - double temp = load - mean; - temp = pow(temp, 2); - local_var += temp; - } + for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + auto temp = static_cast(data[idx]) - mean; + temp *= temp; + local_var += temp; } cuda::atomic_ref ref{*var}; @@ -99,7 +98,7 @@ __device__ void device_var( __syncthreads(); - *var = *var / (size - 1); + *var = *var / static_cast(size - 1); __syncthreads(); } @@ -191,33 +190,25 @@ __device__ T BlockMean(T const* data, int64_t size) } template -__device__ T BlockStd(T const* data, int64_t size) +__device__ double BlockStd(T const* data, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T sum; __shared__ double var; - if (threadIdx.x == 0) { - sum = 0; - var = 0; - } + if (threadIdx.x == 0) { var = 0; } __syncthreads(); - device_var(data, items_per_thread, size, &sum, &var); + + device_var(data, size, &var); return sqrt(var); } template -__device__ T BlockVar(T const* data, int64_t size) +__device__ double BlockVar(T const* data, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T sum; - __shared__ double var; - if (threadIdx.x == 0) { - sum = 0; - var = 0; - } + __shared__ double block_var; + if (threadIdx.x == 0) { block_var = 0; } __syncthreads(); - device_var(data, items_per_thread, size, &sum, &var); - return var; + + device_var(data, size, &block_var); + return block_var; } template From cbc13e6f88d134adb99096d08517a475ee1b5735 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 13:30:10 -0500 Subject: [PATCH 057/121] Refactor block idxmin/idxmax --- python/cudf/udf_cpp/groupby/function.cu | 145 ++++++++++-------------- 1 file changed, 57 insertions(+), 88 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 584bc553b77..8108559179f 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -14,8 +14,6 @@ * limitations under the License. 
*/ -#include - #include #include @@ -103,70 +101,6 @@ __device__ void device_var(T const* data, int64_t size, double* var) __syncthreads(); } -template -__device__ void device_idxmax(T const* data, - int const items_per_thread, - int64_t const* index, - cudf::size_type size, - T init_val, - T* smax, - int64_t* sidx) -{ - T local_max = init_val; - int64_t local_idx = -1; - -#pragma unroll - for (cudf::size_type item = 0; item < items_per_thread; item++) { - if (threadIdx.x + (item * blockDim.x) < size) { - T load = data[threadIdx.x + item * blockDim.x]; - if (load > local_max) { - local_max = load; - local_idx = index[threadIdx.x + item * blockDim.x]; - } - } - } - - atomicMax(smax, local_max); - - __syncthreads(); - - if (local_max == (*smax)) { atomicMin(sidx, local_idx); } - - __syncthreads(); -} - -template -__device__ void device_idxmin(T const* data, - int const items_per_thread, - int64_t const* index, - cudf::size_type size, - T init_val, - T* smin, - int64_t* sidx) -{ - T local_min = init_val; - int64_t local_idx = -1; - -#pragma unroll - for (cudf::size_type item = 0; item < items_per_thread; item++) { - if (threadIdx.x + (item * blockDim.x) < size) { - T load = data[threadIdx.x + item * blockDim.x]; - if (load < local_min) { - local_min = load; - local_idx = index[threadIdx.x + item * blockDim.x]; - } - } - } - - atomicMin(smin, local_min); - - __syncthreads(); - - if (local_min == (*smin)) { atomicMin(sidx, local_idx); } - - __syncthreads(); -} - template __device__ T BlockSum(T const* data, int64_t size) { @@ -214,7 +148,7 @@ __device__ double BlockVar(T const* data, int64_t size) template __device__ T BlockMax(T const* data, int64_t size) { - T local_max = std::numeric_limits::min(); + auto local_max = std::numeric_limits::min(); __shared__ T block_max; if (threadIdx.x == 0) { block_max = local_max; } __syncthreads(); @@ -233,7 +167,7 @@ __device__ T BlockMax(T const* data, int64_t size) template __device__ T BlockMin(T const* data, int64_t size) { - T local_min = std::numeric_limits::max(); + auto local_min = std::numeric_limits::max(); __shared__ T block_min; if (threadIdx.x == 0) { block_min == local_min; } __syncthreads(); @@ -250,34 +184,70 @@ __device__ T BlockMin(T const* data, int64_t size) } template -__device__ T BlockIdxMax(T const* data, int64_t* index, int64_t size) +__device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T smax; - __shared__ int64_t sidx; + __shared__ T block_max; + __shared__ int64_t block_idx_max; + + // TODO: this is wrong but can pass tests!!! 
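// Annotation (not part of the diff): the TODO above refers to the seed value on the next
// line. std::numeric_limits<T>::min() is the most negative value only for integral T; for
// floating-point T it is the smallest positive normal value, so a group containing only
// negative values could yield a wrong argmax even though the tests happen to pass.
// A seed that is correct for both cases is std::numeric_limits<T>::lowest(), or the
// constexpr-if form that patch 059 below adopts:
//   auto local_max = []() {
//     if constexpr (std::is_floating_point_v<T>) { return -std::numeric_limits<T>::max(); }
//     return std::numeric_limits<T>::min();
//   }();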
+ auto local_max = std::numeric_limits::min(); + auto local_idx_max = std::numeric_limits::max(); + if (threadIdx.x == 0) { - smax = std::numeric_limits::min(); - sidx = std::numeric_limits::max(); + block_max = local_max; + block_idx_max = local_idx_max; + } + __syncthreads(); + +#pragma unroll + for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + auto const current_data = data[idx]; + if (current_data > local_max) { + local_max = current_data; + local_idx_max = index[idx]; + } } + + atomicMax(&block_max, local_max); __syncthreads(); - device_idxmax( - data, items_per_thread, index, size, std::numeric_limits::min(), &smax, &sidx); - return sidx; + + if (local_max == block_max) { atomicMin(&block_idx_max, local_idx_max); } + __syncthreads(); + + return block_idx_max; } template -__device__ T BlockIdxMin(T const* data, int64_t* index, T min, int64_t size) +__device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) { - auto const items_per_thread = (size + blockDim.x - 1) / blockDim.x; - __shared__ T smin; - __shared__ int64_t sidx; + __shared__ T block_min; + __shared__ int64_t block_idx_min; + + auto local_min = std::numeric_limits::max(); + auto local_idx_min = std::numeric_limits::max(); + if (threadIdx.x == 0) { - smin = min; - sidx = std::numeric_limits::max(); + block_min = local_min; + block_idx_min = local_idx_min; + } + __syncthreads(); + +#pragma unroll + for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + auto const current_data = data[idx]; + if (current_data < local_min) { + local_min = current_data; + local_idx_min = index[idx]; + } } + + atomicMin(&block_min, local_min); __syncthreads(); - device_idxmin(data, items_per_thread, index, size, min, &smin, &sidx); - return sidx; + + if (local_min == block_min) { atomicMin(&block_idx_min, local_idx_min); } + __syncthreads(); + + return block_idx_min; } extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, @@ -399,8 +369,7 @@ extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, int64_t* index, int64_t size) { - *numba_return_value = - BlockIdxMin(data, index, std::numeric_limits::max(), size); + *numba_return_value = BlockIdxMin(data, index, size); return 0; } @@ -409,6 +378,6 @@ extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, int64_t* index, int64_t size) { - *numba_return_value = BlockIdxMin(data, index, std::numeric_limits::max(), size); + *numba_return_value = BlockIdxMin(data, index, size); return 0; } From ab20731949b2c490184e2641fc81bf50e0d86d74 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 13:31:44 -0500 Subject: [PATCH 058/121] Fix a minor bug --- python/cudf/udf_cpp/groupby/function.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 8108559179f..d34bd280b91 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -169,7 +169,7 @@ __device__ T BlockMin(T const* data, int64_t size) { auto local_min = std::numeric_limits::max(); __shared__ T block_min; - if (threadIdx.x == 0) { block_min == local_min; } + if (threadIdx.x == 0) { block_min = local_min; } __syncthreads(); #pragma unroll From a24f09e5aeb107d8a9d38681cf401b15534b7894 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 14:01:27 -0500 Subject: [PATCH 059/121] Fix the floating point min value bug --- python/cudf/udf_cpp/groupby/function.cu | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index d34bd280b91..146ea4b3d32 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -189,8 +189,10 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) __shared__ T block_max; __shared__ int64_t block_idx_max; - // TODO: this is wrong but can pass tests!!! - auto local_max = std::numeric_limits::min(); + auto local_max = []() { + if constexpr (std::is_floating_point_v) { return -std::numeric_limits::max(); } + return std::numeric_limits::min(); + }(); auto local_idx_max = std::numeric_limits::max(); if (threadIdx.x == 0) { From d908621d2849733a7bff5eb28986b4c6868b729d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 20 Jan 2023 14:27:43 -0500 Subject: [PATCH 060/121] Refactor with CUDA CG --- python/cudf/udf_cpp/groupby/function.cu | 111 +++++++++++++++--------- 1 file changed, 68 insertions(+), 43 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 146ea4b3d32..b1392595671 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -16,6 +16,8 @@ #include +#include + #include __device__ __forceinline__ double atomicMax(double* address, double val) @@ -55,37 +57,43 @@ __device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) } template -__device__ void device_sum(T const* data, int64_t size, T* sum) +__device__ void device_sum(cooperative_groups::thread_block const& block, + T const* data, + int64_t size, + T* sum) { T local_sum = 0; #pragma unroll - for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { local_sum += data[idx]; } cuda::atomic_ref ref{*sum}; ref.fetch_add(local_sum, cuda::std::memory_order_relaxed); - __syncthreads(); + block.sync(); } template -__device__ void device_var(T const* data, int64_t size, double* var) +__device__ void device_var(cooperative_groups::thread_block const& block, + T const* data, + int64_t size, + double* var) { T local_sum = 0; double local_var = 0; __shared__ T block_sum; - if (threadIdx.x == 0) { block_sum = 0; } - __syncthreads(); + if (block.thread_rank() == 0) { block_sum = 0; } + block.sync(); - device_sum(data, size, &block_sum); + device_sum(block, data, size, &block_sum); auto const mean = static_cast(block_sum) / static_cast(size); #pragma unroll - for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { auto temp = static_cast(data[idx]) - mean; temp *= temp; local_var += temp; @@ -93,73 +101,84 @@ __device__ void device_var(T const* data, int64_t size, double* var) cuda::atomic_ref ref{*var}; ref.fetch_add(local_var, cuda::std::memory_order_relaxed); - - __syncthreads(); + block.sync(); *var = *var / static_cast(size - 1); - - __syncthreads(); + block.sync(); } template __device__ T BlockSum(T const* data, int64_t size) { + auto block = cooperative_groups::this_thread_block(); + __shared__ T block_sum; - if (threadIdx.x == 0) { block_sum = 0; } - __syncthreads(); + if (block.thread_rank() == 0) { block_sum = 0; } + block.sync(); - device_sum(data, size, &block_sum); + device_sum(block, data, size, &block_sum); return block_sum; } template __device__ T BlockMean(T const* data, int64_t size) { + auto block = cooperative_groups::this_thread_block(); 
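// Annotation (not part of the diff): assuming the one-dimensional thread blocks that the
// original threadIdx.x / blockDim.x indexing implies, the cooperative-groups calls used in
// this refactor map one-to-one onto the primitives they replace:
//   block.thread_rank()  ->  threadIdx.x
//   block.size()         ->  blockDim.x
//   block.sync()         ->  __syncthreads()
// so the change is about readability and composability rather than behavior.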
+ __shared__ T block_sum; - if (threadIdx.x == 0) { block_sum = 0; } - __syncthreads(); + if (block.thread_rank() == 0) { block_sum = 0; } + block.sync(); - device_sum(data, size, &block_sum); + device_sum(block, data, size, &block_sum); return block_sum / static_cast(size); } template __device__ double BlockStd(T const* data, int64_t size) { + auto block = cooperative_groups::this_thread_block(); + __shared__ double var; - if (threadIdx.x == 0) { var = 0; } - __syncthreads(); + if (block.thread_rank() == 0) { var = 0; } + block.sync(); - device_var(data, size, &var); + device_var(block, data, size, &var); return sqrt(var); } template __device__ double BlockVar(T const* data, int64_t size) { + auto block = cooperative_groups::this_thread_block(); + __shared__ double block_var; - if (threadIdx.x == 0) { block_var = 0; } - __syncthreads(); + if (block.thread_rank() == 0) { block_var = 0; } + block.sync(); - device_var(data, size, &block_var); + device_var(block, data, size, &block_var); return block_var; } template __device__ T BlockMax(T const* data, int64_t size) { - auto local_max = std::numeric_limits::min(); + auto block = cooperative_groups::this_thread_block(); + + auto local_max = []() { + if constexpr (std::is_floating_point_v) { return -std::numeric_limits::max(); } + return std::numeric_limits::min(); + }(); __shared__ T block_max; - if (threadIdx.x == 0) { block_max = local_max; } - __syncthreads(); + if (block.thread_rank() == 0) { block_max = local_max; } + block.sync(); #pragma unroll - for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { local_max = max(local_max, data[idx]); } atomicMax(&block_max, local_max); - __syncthreads(); + block.sync(); return block_max; } @@ -167,18 +186,20 @@ __device__ T BlockMax(T const* data, int64_t size) template __device__ T BlockMin(T const* data, int64_t size) { + auto block = cooperative_groups::this_thread_block(); + auto local_min = std::numeric_limits::max(); __shared__ T block_min; - if (threadIdx.x == 0) { block_min = local_min; } - __syncthreads(); + if (block.thread_rank() == 0) { block_min = local_min; } + block.sync(); #pragma unroll - for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { local_min = min(local_min, data[idx]); } atomicMin(&block_min, local_min); - __syncthreads(); + block.sync(); return block_min; } @@ -186,6 +207,8 @@ __device__ T BlockMin(T const* data, int64_t size) template __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) { + auto block = cooperative_groups::this_thread_block(); + __shared__ T block_max; __shared__ int64_t block_idx_max; @@ -195,14 +218,14 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) }(); auto local_idx_max = std::numeric_limits::max(); - if (threadIdx.x == 0) { + if (block.thread_rank() == 0) { block_max = local_max; block_idx_max = local_idx_max; } - __syncthreads(); + block.sync(); #pragma unroll - for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { auto const current_data = data[idx]; if (current_data > local_max) { local_max = current_data; @@ -211,10 +234,10 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) } atomicMax(&block_max, local_max); - __syncthreads(); + block.sync(); if (local_max == block_max) { atomicMin(&block_idx_max, 
local_idx_max); } - __syncthreads(); + block.sync(); return block_idx_max; } @@ -222,20 +245,22 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) template __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) { + auto block = cooperative_groups::this_thread_block(); + __shared__ T block_min; __shared__ int64_t block_idx_min; auto local_min = std::numeric_limits::max(); auto local_idx_min = std::numeric_limits::max(); - if (threadIdx.x == 0) { + if (block.thread_rank() == 0) { block_min = local_min; block_idx_min = local_idx_min; } - __syncthreads(); + block.sync(); #pragma unroll - for (int64_t idx = threadIdx.x; idx < size; idx += blockDim.x) { + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { auto const current_data = data[idx]; if (current_data < local_min) { local_min = current_data; @@ -244,10 +269,10 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) } atomicMin(&block_min, local_min); - __syncthreads(); + block.sync(); if (local_min == block_min) { atomicMin(&block_idx_min, local_idx_min); } - __syncthreads(); + block.sync(); return block_idx_min; } From 595746a76c9cf5987797f7c4405283c28e0babb1 Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Sun, 22 Jan 2023 14:13:01 +0000 Subject: [PATCH 061/121] C++ changes: (1) Addressing more reviewer's comment, (2) Replacing custom atomic with cuda/atomic for max and min (3) New C++ templating style --- python/cudf/udf_cpp/groupby/function.cu | 258 ++++++++---------------- 1 file changed, 89 insertions(+), 169 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index b1392595671..0ff836adbfc 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -20,42 +20,6 @@ #include -__device__ __forceinline__ double atomicMax(double* address, double val) -{ - unsigned long long old = __double_as_longlong(*address); - while (val > __longlong_as_double(old)) { - unsigned long long assumed = old; - if ((old = atomicCAS( - reinterpret_cast(address), assumed, __double_as_longlong(val))) == - assumed) - break; - } - return __longlong_as_double(old); -} - -__device__ __forceinline__ int64_t atomicMax(int64_t* address, int64_t val) -{ - return atomicMax(reinterpret_cast(address), static_cast(val)); -} - -__device__ __forceinline__ double atomicMin(double* address, double val) -{ - unsigned long long old = __double_as_longlong(*address); - while (val < __longlong_as_double(old)) { - unsigned long long assumed = old; - if ((old = atomicCAS( - reinterpret_cast(address), assumed, __double_as_longlong(val))) == - assumed) - break; - } - return __longlong_as_double(old); -} - -__device__ __forceinline__ int64_t atomicMin(int64_t* address, int64_t val) -{ - return atomicMin(reinterpret_cast(address), static_cast(val)); -} - template __device__ void device_sum(cooperative_groups::thread_block const& block, T const* data, @@ -177,7 +141,9 @@ __device__ T BlockMax(T const* data, int64_t size) local_max = max(local_max, data[idx]); } - atomicMax(&block_max, local_max); + cuda::atomic_ref ref{block_max}; + ref.fetch_max(local_max, cuda::std::memory_order_relaxed); + block.sync(); return block_max; @@ -198,7 +164,9 @@ __device__ T BlockMin(T const* data, int64_t size) local_min = min(local_min, data[idx]); } - atomicMin(&block_min, local_min); + cuda::atomic_ref ref{block_min}; + ref.fetch_min(local_min, cuda::std::memory_order_relaxed); + block.sync(); return 
block_min; @@ -233,10 +201,14 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) } } - atomicMax(&block_max, local_max); + cuda::atomic_ref ref{block_max}; + ref.fetch_max(local_max, cuda::std::memory_order_relaxed); block.sync(); - if (local_max == block_max) { atomicMin(&block_idx_max, local_idx_max); } + cuda::atomic_ref ref_idx{block_idx_max}; + if (local_max == block_max) { + ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); + } block.sync(); return block_idx_max; @@ -268,143 +240,91 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) } } - atomicMin(&block_min, local_min); + cuda::atomic_ref ref{block_min}; + ref.fetch_min(local_min, cuda::std::memory_order_relaxed); block.sync(); - if (local_min == block_min) { atomicMin(&block_idx_min, local_idx_min); } + cuda::atomic_ref ref_idx{block_idx_min}; + if (local_min == block_min) { + ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); + } block.sync(); return block_idx_min; } -extern "C" __device__ int BlockSum_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t size) -{ - *numba_return_value = BlockSum(data, size); - return 0; -} - -extern "C" __device__ int BlockSum_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - *numba_return_value = BlockSum(data, size); - return 0; -} - -extern "C" __device__ int BlockMean_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t size) -{ - *numba_return_value = BlockMean(data, size); - return 0; -} - -extern "C" __device__ int BlockMean_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - *numba_return_value = BlockMean(data, size); - return 0; -} - -extern "C" __device__ int BlockStd_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ - *numba_return_value = BlockStd(data, size); - return 0; -} - -extern "C" __device__ int BlockStd_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - *numba_return_value = BlockStd(data, size); - return 0; -} - -extern "C" __device__ int BlockVar_int64(double* numba_return_value, - int64_t const* data, - int64_t size) -{ - *numba_return_value = BlockVar(data, size); - return 0; -} - -extern "C" __device__ int BlockVar_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - *numba_return_value = BlockVar(data, size); - return 0; -} - -extern "C" __device__ int BlockMax_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t size) -{ - *numba_return_value = BlockMax(data, size); - return 0; -} - -extern "C" __device__ int BlockMax_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - *numba_return_value = BlockMax(data, size); - return 0; -} - -extern "C" __device__ int BlockMin_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t size) -{ - *numba_return_value = BlockMin(data, size); - return 0; -} - -extern "C" __device__ int BlockMin_float64(double* numba_return_value, - double const* data, - int64_t size) -{ - *numba_return_value = BlockMin(data, size); - return 0; -} - -extern "C" __device__ int BlockIdxMax_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t* index, - int64_t size) -{ - *numba_return_value = BlockIdxMax(data, index, size); - return 0; -} +extern "C" { +#define make_definition(name, cname, type) \ + __device__ int name ## _ ## cname (int64_t* numba_return_value, type* const data, int64_t size) { \ + *numba_return_value = name(data, size); \ 
+ return 0; \ + } -extern "C" __device__ int BlockIdxMax_float64(int64_t* numba_return_value, - double const* data, - int64_t* index, - int64_t size) -{ - *numba_return_value = BlockIdxMax(data, index, size); - return 0; +// make_definition(BlockSum, int8, int8_t); +// make_definition(BlockSum, int16, int16_t); +make_definition(BlockSum, int32, int); +make_definition(BlockSum, int64, int64_t); +make_definition(BlockSum, float32, float); +make_definition(BlockSum, float64, double); +// make_definition(BlockSum, bool, bool); +// make_definition(BlockMean, int8, int8_t); +// make_definition(BlockMean, int16, int16_t); +make_definition(BlockMean, int32, int); +make_definition(BlockMean, int64, int64_t); +make_definition(BlockMean, float32, float); +make_definition(BlockMean, float64, double); +// make_definition(BlockMean, bool, bool); +// make_definition(BlockStd, int8, int8_t); +// make_definition(BlockStd, int16, int16_t); +make_definition(BlockStd, int32, int); +make_definition(BlockStd, int64, int64_t); +make_definition(BlockStd, float32, float); +make_definition(BlockStd, float64, double); +// make_definition(BlockStd, bool, bool); +// make_definition(BlockVar, int8, int8_t); +// make_definition(BlockVar, int16, int16_t); +make_definition(BlockVar, int32, int); +make_definition(BlockVar, int64, int64_t); +make_definition(BlockVar, float32, float); +make_definition(BlockVar, float64, double); +// make_definition(BlockVar, bool, bool); +// make_definition(BlockMin, int8, int8_t); +// make_definition(BlockMin, int16, int16_t); +make_definition(BlockMin, int32, int); +make_definition(BlockMin, int64, int64_t); +make_definition(BlockMin, float32, float); +make_definition(BlockMin, float64, double); +// make_definition(BlockMin, bool, bool); +// make_definition(BlockMax, int8, int8_t); +// make_definition(BlockMax, int16, int16_t); +make_definition(BlockMax, int32, int); +make_definition(BlockMax, int64, int64_t); +make_definition(BlockMax, float32, float); +make_definition(BlockMax, float64, double); +// make_definition(BlockMax, bool, bool); +#undef make_definition } -extern "C" __device__ int BlockIdxMin_int64(int64_t* numba_return_value, - int64_t const* data, - int64_t* index, - int64_t size) -{ - *numba_return_value = BlockIdxMin(data, index, size); - return 0; -} +extern "C" { +#define make_definition_idx(name, cname, type) \ + __device__ int name ## _ ## cname (int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) { \ + *numba_return_value = name(data, index, size); \ + return 0; \ + } -extern "C" __device__ int BlockIdxMin_float64(int64_t* numba_return_value, - double const* data, - int64_t* index, - int64_t size) -{ - *numba_return_value = BlockIdxMin(data, index, size); - return 0; -} +// make_definition_idx(BlockIdxMin, int8, int8_t); +// make_definition_idx(BlockIdxMin, int16, int16_t); +make_definition_idx(BlockIdxMin, int32, int); +make_definition_idx(BlockIdxMin, int64, int64_t); +make_definition_idx(BlockIdxMin, float32, float); +make_definition_idx(BlockIdxMin, float64, double); +// make_definition_idx(BlockIdxMin, bool, bool); +// make_definition_idx(BlockIdxMax, int8, int8_t); +// make_definition_idx(BlockIdxMax, int16, int16_t); +make_definition_idx(BlockIdxMax, int32, int); +make_definition_idx(BlockIdxMax, int64, int64_t); +make_definition_idx(BlockIdxMax, float32, float); +make_definition_idx(BlockIdxMax, float64, double); +// make_definition_idx(BlockIdxMax, bool, bool); +#undef make_definition_idx +} \ No newline at end of file From 
bb8b7c3b7a3ca8662512f0eb37d1a2cce21a20ee Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Sun, 22 Jan 2023 19:53:08 -0800 Subject: [PATCH 062/121] style --- python/cudf/udf_cpp/groupby/function.cu | 33 ++++++++++++------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 0ff836adbfc..a3aa8b14065 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -206,9 +206,7 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) block.sync(); cuda::atomic_ref ref_idx{block_idx_max}; - if (local_max == block_max) { - ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); - } + if (local_max == block_max) { ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); } block.sync(); return block_idx_max; @@ -245,20 +243,19 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) block.sync(); cuda::atomic_ref ref_idx{block_idx_min}; - if (local_min == block_min) { - ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); - } + if (local_min == block_min) { ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); } block.sync(); return block_idx_min; } extern "C" { -#define make_definition(name, cname, type) \ - __device__ int name ## _ ## cname (int64_t* numba_return_value, type* const data, int64_t size) { \ - *numba_return_value = name(data, size); \ - return 0; \ - } +#define make_definition(name, cname, type) \ + __device__ int name##_##cname(int64_t* numba_return_value, type* const data, int64_t size) \ + { \ + *numba_return_value = name(data, size); \ + return 0; \ + } // make_definition(BlockSum, int8, int8_t); // make_definition(BlockSum, int16, int16_t); @@ -306,11 +303,13 @@ make_definition(BlockMax, float64, double); } extern "C" { -#define make_definition_idx(name, cname, type) \ - __device__ int name ## _ ## cname (int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) { \ - *numba_return_value = name(data, index, size); \ - return 0; \ - } +#define make_definition_idx(name, cname, type) \ + __device__ int name##_##cname( \ + int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ + { \ + *numba_return_value = name(data, index, size); \ + return 0; \ + } // make_definition_idx(BlockIdxMin, int8, int8_t); // make_definition_idx(BlockIdxMin, int16, int16_t); @@ -327,4 +326,4 @@ make_definition_idx(BlockIdxMax, float32, float); make_definition_idx(BlockIdxMax, float64, double); // make_definition_idx(BlockIdxMax, bool, bool); #undef make_definition_idx -} \ No newline at end of file +} From 1f475f0942a1b80c885e48ee7cbd016cbf4c4af5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 10:13:16 -0500 Subject: [PATCH 063/121] Use proper cuda thread scope --- python/cudf/udf_cpp/groupby/function.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index a3aa8b14065..a6a18c7a3c1 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -33,7 +33,7 @@ __device__ void device_sum(cooperative_groups::thread_block const& block, local_sum += data[idx]; } - cuda::atomic_ref ref{*sum}; + cuda::atomic_ref ref{*sum}; ref.fetch_add(local_sum, cuda::std::memory_order_relaxed); block.sync(); @@ -63,7 +63,7 @@ __device__ void 
device_var(cooperative_groups::thread_block const& block, local_var += temp; } - cuda::atomic_ref ref{*var}; + cuda::atomic_ref ref{*var}; ref.fetch_add(local_var, cuda::std::memory_order_relaxed); block.sync(); @@ -141,7 +141,7 @@ __device__ T BlockMax(T const* data, int64_t size) local_max = max(local_max, data[idx]); } - cuda::atomic_ref ref{block_max}; + cuda::atomic_ref ref{block_max}; ref.fetch_max(local_max, cuda::std::memory_order_relaxed); block.sync(); @@ -164,7 +164,7 @@ __device__ T BlockMin(T const* data, int64_t size) local_min = min(local_min, data[idx]); } - cuda::atomic_ref ref{block_min}; + cuda::atomic_ref ref{block_min}; ref.fetch_min(local_min, cuda::std::memory_order_relaxed); block.sync(); @@ -201,11 +201,11 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) } } - cuda::atomic_ref ref{block_max}; + cuda::atomic_ref ref{block_max}; ref.fetch_max(local_max, cuda::std::memory_order_relaxed); block.sync(); - cuda::atomic_ref ref_idx{block_idx_max}; + cuda::atomic_ref ref_idx{block_idx_max}; if (local_max == block_max) { ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); } block.sync(); @@ -238,11 +238,11 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) } } - cuda::atomic_ref ref{block_min}; + cuda::atomic_ref ref{block_min}; ref.fetch_min(local_min, cuda::std::memory_order_relaxed); block.sync(); - cuda::atomic_ref ref_idx{block_idx_min}; + cuda::atomic_ref ref_idx{block_idx_min}; if (local_min == block_min) { ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); } block.sync(); From 209188f92564c9d7a53a8b0485993f8e53fdc374 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 11:15:53 -0500 Subject: [PATCH 064/121] Request cpp review for udf_cpp --- .github/CODEOWNERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 4b3ed8d3e38..9578d32d13d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,6 @@ #cpp code owners -cpp/ @rapidsai/cudf-cpp-codeowners +cpp/ @rapidsai/cudf-cpp-codeowners +python/cudf/udf_cpp/ @rapidsai/cudf-cpp-codeowners #python code owners python/ @rapidsai/cudf-python-codeowners From 826ed25d3c8f821d2aa33896d7369da338a9b849 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 11:32:16 -0500 Subject: [PATCH 065/121] Remove unsupported data types --- python/cudf/udf_cpp/groupby/function.cu | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index a6a18c7a3c1..ad900826a7b 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -257,48 +257,30 @@ extern "C" { return 0; \ } -// make_definition(BlockSum, int8, int8_t); -// make_definition(BlockSum, int16, int16_t); make_definition(BlockSum, int32, int); make_definition(BlockSum, int64, int64_t); make_definition(BlockSum, float32, float); make_definition(BlockSum, float64, double); -// make_definition(BlockSum, bool, bool); -// make_definition(BlockMean, int8, int8_t); -// make_definition(BlockMean, int16, int16_t); make_definition(BlockMean, int32, int); make_definition(BlockMean, int64, int64_t); make_definition(BlockMean, float32, float); make_definition(BlockMean, float64, double); -// make_definition(BlockMean, bool, bool); -// make_definition(BlockStd, int8, int8_t); -// make_definition(BlockStd, int16, int16_t); make_definition(BlockStd, int32, int); 
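// Annotation (not part of the diff): each make_definition(...) line above and below expands,
// through the macro introduced in patch 061, into an extern "C" shim whose name encodes the
// operation and dtype, e.g. make_definition(BlockStd, int64, int64_t) becomes roughly
//   __device__ int BlockStd_int64(int64_t* numba_return_value, int64_t* const data, int64_t size)
//   { *numba_return_value = BlockStd(data, size); return 0; }
// These flat names are presumably what the Python typing/lowering layer declares as device
// functions; patch 067 later widens the macro with a per-function return type so that double
// results (mean/std/var) are not forced through int64_t*.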
make_definition(BlockStd, int64, int64_t); make_definition(BlockStd, float32, float); make_definition(BlockStd, float64, double); -// make_definition(BlockStd, bool, bool); -// make_definition(BlockVar, int8, int8_t); -// make_definition(BlockVar, int16, int16_t); make_definition(BlockVar, int32, int); make_definition(BlockVar, int64, int64_t); make_definition(BlockVar, float32, float); make_definition(BlockVar, float64, double); -// make_definition(BlockVar, bool, bool); -// make_definition(BlockMin, int8, int8_t); -// make_definition(BlockMin, int16, int16_t); make_definition(BlockMin, int32, int); make_definition(BlockMin, int64, int64_t); make_definition(BlockMin, float32, float); make_definition(BlockMin, float64, double); -// make_definition(BlockMin, bool, bool); -// make_definition(BlockMax, int8, int8_t); -// make_definition(BlockMax, int16, int16_t); make_definition(BlockMax, int32, int); make_definition(BlockMax, int64, int64_t); make_definition(BlockMax, float32, float); make_definition(BlockMax, float64, double); -// make_definition(BlockMax, bool, bool); #undef make_definition } @@ -311,19 +293,13 @@ extern "C" { return 0; \ } -// make_definition_idx(BlockIdxMin, int8, int8_t); -// make_definition_idx(BlockIdxMin, int16, int16_t); make_definition_idx(BlockIdxMin, int32, int); make_definition_idx(BlockIdxMin, int64, int64_t); make_definition_idx(BlockIdxMin, float32, float); make_definition_idx(BlockIdxMin, float64, double); -// make_definition_idx(BlockIdxMin, bool, bool); -// make_definition_idx(BlockIdxMax, int8, int8_t); -// make_definition_idx(BlockIdxMax, int16, int16_t); make_definition_idx(BlockIdxMax, int32, int); make_definition_idx(BlockIdxMax, int64, int64_t); make_definition_idx(BlockIdxMax, float32, float); make_definition_idx(BlockIdxMax, float64, double); -// make_definition_idx(BlockIdxMax, bool, bool); #undef make_definition_idx } From 1cf91ea37ff772ea4bcf55d8e58bb4073deb55e4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 12:14:26 -0500 Subject: [PATCH 066/121] Use exclusively thread 0 to write var output + minor cleanups --- python/cudf/udf_cpp/groupby/function.cu | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index ad900826a7b..d27bea17480 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -67,7 +67,7 @@ __device__ void device_var(cooperative_groups::thread_block const& block, ref.fetch_add(local_var, cuda::std::memory_order_relaxed); block.sync(); - *var = *var / static_cast(size - 1); + if (block.thread_rank() == 0) { *var = *var / static_cast(size - 1); } block.sync(); } @@ -205,8 +205,10 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) ref.fetch_max(local_max, cuda::std::memory_order_relaxed); block.sync(); - cuda::atomic_ref ref_idx{block_idx_max}; - if (local_max == block_max) { ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); } + if (local_max == block_max) { + cuda::atomic_ref ref_idx{block_idx_max}; + ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); + } block.sync(); return block_idx_max; @@ -242,8 +244,10 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) ref.fetch_min(local_min, cuda::std::memory_order_relaxed); block.sync(); - cuda::atomic_ref ref_idx{block_idx_min}; - if (local_min == block_min) { ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); } + 
if (local_min == block_min) { + cuda::atomic_ref ref_idx{block_idx_min}; + ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); + } block.sync(); return block_idx_min; From 381dd00cb198e6a614949c08b5e5047a447ded80 Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Mon, 23 Jan 2023 18:47:23 +0000 Subject: [PATCH 067/121] Addressing more reviewer's commment and fix mean bug --- python/cudf/udf_cpp/groupby/function.cu | 75 ++++++++++++------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index ad900826a7b..858f0e5b163 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -67,7 +67,7 @@ __device__ void device_var(cooperative_groups::thread_block const& block, ref.fetch_add(local_var, cuda::std::memory_order_relaxed); block.sync(); - *var = *var / static_cast(size - 1); + if (block.thread_rank() == 0) *var = *var / static_cast(size - 1); block.sync(); } @@ -85,7 +85,7 @@ __device__ T BlockSum(T const* data, int64_t size) } template -__device__ T BlockMean(T const* data, int64_t size) +__device__ double BlockMean(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); @@ -250,49 +250,46 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) } extern "C" { -#define make_definition(name, cname, type) \ - __device__ int name##_##cname(int64_t* numba_return_value, type* const data, int64_t size) \ - { \ - *numba_return_value = name(data, size); \ - return 0; \ +#define make_definition(name, cname, type, return_type) \ + __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ + { \ + *numba_return_value = name(data, size); \ + return 0; \ } -make_definition(BlockSum, int32, int); -make_definition(BlockSum, int64, int64_t); -make_definition(BlockSum, float32, float); -make_definition(BlockSum, float64, double); -make_definition(BlockMean, int32, int); -make_definition(BlockMean, int64, int64_t); -make_definition(BlockMean, float32, float); -make_definition(BlockMean, float64, double); -make_definition(BlockStd, int32, int); -make_definition(BlockStd, int64, int64_t); -make_definition(BlockStd, float32, float); -make_definition(BlockStd, float64, double); -make_definition(BlockVar, int32, int); -make_definition(BlockVar, int64, int64_t); -make_definition(BlockVar, float32, float); -make_definition(BlockVar, float64, double); -make_definition(BlockMin, int32, int); -make_definition(BlockMin, int64, int64_t); -make_definition(BlockMin, float32, float); -make_definition(BlockMin, float64, double); -make_definition(BlockMax, int32, int); -make_definition(BlockMax, int64, int64_t); -make_definition(BlockMax, float32, float); -make_definition(BlockMax, float64, double); +make_definition(BlockSum, int32, int, int); +make_definition(BlockSum, int64, int64_t, int64_t); +make_definition(BlockSum, float32, float, float); +make_definition(BlockSum, float64, double, double); +make_definition(BlockMean, int32, int, double); +make_definition(BlockMean, int64, int64_t, double); +make_definition(BlockMean, float32, float, double); +make_definition(BlockMean, float64, double, double); +make_definition(BlockStd, int32, int, double); +make_definition(BlockStd, int64, int64_t, double); +make_definition(BlockStd, float32, float, double); +make_definition(BlockStd, float64, double, double); +make_definition(BlockVar, int32, int, double); +make_definition(BlockVar, 
int64, int64_t, double); +make_definition(BlockVar, float32, float, double); +make_definition(BlockVar, float64, double, double); +make_definition(BlockMin, int32, int, int); +make_definition(BlockMin, int64, int64_t, int64_t); +make_definition(BlockMin, float32, float, float); +make_definition(BlockMin, float64, double, double); +make_definition(BlockMax, int32, int, int); +make_definition(BlockMax, int64, int64_t, int64_t); +make_definition(BlockMax, float32, float, float); +make_definition(BlockMax, float64, double, double); #undef make_definition } extern "C" { -#define make_definition_idx(name, cname, type) \ - __device__ int name##_##cname( \ - int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ - { \ - *numba_return_value = name(data, index, size); \ - return 0; \ - } - +#define make_definition_idx(name, cname, type) \ + __device__ int name ## _ ## cname (int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) { \ + *numba_return_value = name(data, index, size); \ + return 0; \ + } make_definition_idx(BlockIdxMin, int32, int); make_definition_idx(BlockIdxMin, int64, int64_t); make_definition_idx(BlockIdxMin, float32, float); From ee87548bb1a984a907c2ca5c0385da0a8c4e684a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 23 Jan 2023 11:22:13 -0800 Subject: [PATCH 068/121] error if nulls are present, dont not patch numba linker in strings_udf --- python/cudf/cudf/core/groupby/groupby.py | 7 +++++++ python/strings_udf/cpp/CMakeLists.txt | 2 +- python/strings_udf/strings_udf/__init__.py | 9 +-------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3082ca8520c..94777862044 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -852,6 +852,13 @@ def mult(df): group_names, offsets, group_keys, grouped_values = self._grouped() if engine == "jit": + # Nulls are not yet supported + for colname in self.grouping.values._data.keys(): + if self.obj._data[colname].has_nulls(): + raise ValueError( + "Nulls not yet supported with groupby JIT engine" + ) + chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args ) diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index 2cab9871f74..3e58d10d6e2 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index 247a22fe1f9..66c037125e6 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -4,11 +4,7 @@ from cuda import cudart from ptxcompiler.patch import NO_DRIVER, safe_get_versions -from cudf.core.udf.utils import ( - _get_cuda_version_from_ptx_file, - _get_ptx_file, - maybe_patch_numba_linker, -) +from cudf.core.udf.utils import _get_cuda_version_from_ptx_file, _get_ptx_file from . 
import _version @@ -45,7 +41,4 @@ def set_malloc_heap_size(size=None): ptxpath = None versions = safe_get_versions() if versions != NO_DRIVER: - driver_version, runtime_version = versions ptxpath = _get_ptx_file(path, "shim_") - strings_udf_ptx_version = _get_cuda_version_from_ptx_file(ptxpath) - maybe_patch_numba_linker(driver_version, strings_udf_ptx_version) From 1d4edc8658f5df88396b60808088706f9a575dec Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 23 Jan 2023 11:34:19 -0800 Subject: [PATCH 069/121] Style --- python/cudf/udf_cpp/groupby/function.cu | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 858f0e5b163..6a4ce038013 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -250,11 +250,11 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) } extern "C" { -#define make_definition(name, cname, type, return_type) \ +#define make_definition(name, cname, type, return_type) \ __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ - { \ - *numba_return_value = name(data, size); \ - return 0; \ + { \ + *numba_return_value = name(data, size); \ + return 0; \ } make_definition(BlockSum, int32, int, int); @@ -285,11 +285,13 @@ make_definition(BlockMax, float64, double, double); } extern "C" { -#define make_definition_idx(name, cname, type) \ - __device__ int name ## _ ## cname (int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) { \ - *numba_return_value = name(data, index, size); \ - return 0; \ - } +#define make_definition_idx(name, cname, type) \ + __device__ int name##_##cname( \ + int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ + { \ + *numba_return_value = name(data, index, size); \ + return 0; \ + } make_definition_idx(BlockIdxMin, int32, int); make_definition_idx(BlockIdxMin, int64, int64_t); make_definition_idx(BlockIdxMin, float32, float); From 4fe21fb47fbdc10065f5b6c2c08dada1c87da625 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 14:40:19 -0500 Subject: [PATCH 070/121] Replace int with int32_t --- python/cudf/udf_cpp/groupby/function.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 5f0b323c91e..7331946b05b 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -261,27 +261,27 @@ extern "C" { return 0; \ } -make_definition(BlockSum, int32, int, int); +make_definition(BlockSum, int32, int32_t, int32_t); make_definition(BlockSum, int64, int64_t, int64_t); make_definition(BlockSum, float32, float, float); make_definition(BlockSum, float64, double, double); -make_definition(BlockMean, int32, int, double); +make_definition(BlockMean, int32, int32_t, double); make_definition(BlockMean, int64, int64_t, double); make_definition(BlockMean, float32, float, double); make_definition(BlockMean, float64, double, double); -make_definition(BlockStd, int32, int, double); +make_definition(BlockStd, int32, int32_t, double); make_definition(BlockStd, int64, int64_t, double); make_definition(BlockStd, float32, float, double); make_definition(BlockStd, float64, double, double); -make_definition(BlockVar, int32, int, double); +make_definition(BlockVar, int32, int32_t, double); make_definition(BlockVar, int64, 
int64_t, double); make_definition(BlockVar, float32, float, double); make_definition(BlockVar, float64, double, double); -make_definition(BlockMin, int32, int, int); +make_definition(BlockMin, int32, int32_t, int32_t); make_definition(BlockMin, int64, int64_t, int64_t); make_definition(BlockMin, float32, float, float); make_definition(BlockMin, float64, double, double); -make_definition(BlockMax, int32, int, int); +make_definition(BlockMax, int32, int32_t, int32_t); make_definition(BlockMax, int64, int64_t, int64_t); make_definition(BlockMax, float32, float, float); make_definition(BlockMax, float64, double, double); @@ -296,11 +296,11 @@ extern "C" { *numba_return_value = name(data, index, size); \ return 0; \ } -make_definition_idx(BlockIdxMin, int32, int); +make_definition_idx(BlockIdxMin, int32, int32_t); make_definition_idx(BlockIdxMin, int64, int64_t); make_definition_idx(BlockIdxMin, float32, float); make_definition_idx(BlockIdxMin, float64, double); -make_definition_idx(BlockIdxMax, int32, int); +make_definition_idx(BlockIdxMax, int32, int32_t); make_definition_idx(BlockIdxMax, int64, int64_t); make_definition_idx(BlockIdxMax, float32, float); make_definition_idx(BlockIdxMax, float64, double); From 3fbe3fff30de4c0eae756b3f9124c67e21e4a7d4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 23 Jan 2023 16:41:21 -0500 Subject: [PATCH 071/121] Remove unused template specializations --- python/cudf/udf_cpp/groupby/function.cu | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 7331946b05b..da62eac7c9e 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -261,29 +261,17 @@ extern "C" { return 0; \ } -make_definition(BlockSum, int32, int32_t, int32_t); make_definition(BlockSum, int64, int64_t, int64_t); -make_definition(BlockSum, float32, float, float); make_definition(BlockSum, float64, double, double); -make_definition(BlockMean, int32, int32_t, double); make_definition(BlockMean, int64, int64_t, double); -make_definition(BlockMean, float32, float, double); make_definition(BlockMean, float64, double, double); -make_definition(BlockStd, int32, int32_t, double); make_definition(BlockStd, int64, int64_t, double); -make_definition(BlockStd, float32, float, double); make_definition(BlockStd, float64, double, double); -make_definition(BlockVar, int32, int32_t, double); make_definition(BlockVar, int64, int64_t, double); -make_definition(BlockVar, float32, float, double); make_definition(BlockVar, float64, double, double); -make_definition(BlockMin, int32, int32_t, int32_t); make_definition(BlockMin, int64, int64_t, int64_t); -make_definition(BlockMin, float32, float, float); make_definition(BlockMin, float64, double, double); -make_definition(BlockMax, int32, int32_t, int32_t); make_definition(BlockMax, int64, int64_t, int64_t); -make_definition(BlockMax, float32, float, float); make_definition(BlockMax, float64, double, double); #undef make_definition } @@ -296,13 +284,9 @@ extern "C" { *numba_return_value = name(data, index, size); \ return 0; \ } -make_definition_idx(BlockIdxMin, int32, int32_t); make_definition_idx(BlockIdxMin, int64, int64_t); -make_definition_idx(BlockIdxMin, float32, float); make_definition_idx(BlockIdxMin, float64, double); -make_definition_idx(BlockIdxMax, int32, int32_t); make_definition_idx(BlockIdxMax, int64, int64_t); -make_definition_idx(BlockIdxMax, float32, float); make_definition_idx(BlockIdxMax, float64, 
double); #undef make_definition_idx } From 9af367090fa98717eba426e37e4a06e5cfffe73a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 06:44:20 -0800 Subject: [PATCH 072/121] update utility functions to no longer be strings_udf specific --- python/cudf/cudf/core/udf/utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 217dd0aaa97..8b4ce9ae15b 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -294,7 +294,7 @@ def _get_appropriate_file(sms, cc): def _get_ptx_file(path, prefix): if "RAPIDS_NO_INITIALIZE" in os.environ: - # shim_60.ptx is always built + # cc=60 ptx is always built cc = int(os.environ.get("STRINGS_UDF_CC", "60")) else: dev = cuda.get_current_device() @@ -324,7 +324,7 @@ def _get_ptx_file(path, prefix): if regular_result is None: raise RuntimeError( - "This strings_udf installation is missing the necessary PTX " + "This cuDF installation is missing the necessary PTX " f"files that are <={cc}." ) else: @@ -402,18 +402,17 @@ def _setup_numba_linker(path, prefix): if versions != NO_DRIVER: driver_version, runtime_version = versions ptxpath = _get_ptx_file(path, prefix) - strings_udf_ptx_version = _get_cuda_version_from_ptx_file(ptxpath) - maybe_patch_numba_linker(driver_version, strings_udf_ptx_version) + ptx_toolkit_version = _get_cuda_version_from_ptx_file(ptxpath) + maybe_patch_numba_linker(driver_version, ptx_toolkit_version) def maybe_patch_numba_linker(driver_version, ptx_toolkit_version): # Numba thinks cubinlinker is only needed if the driver is older than - # the ctk but when strings_udf is present, it might also need to patch - # because the PTX file strings_udf relies on may be newer than - # the driver as well + # the ctk, but when PTX files are present, it might also need to patch + # because those PTX files may newer than the driver as well if driver_version < ptx_toolkit_version: print( - "Driver version %s.%s needs patching due to strings_udf" + "Driver version %s.%s needs patching due to PTX files" % driver_version ) if _numba_version_ok: From afd094943874713107a47ec786c93e0cfd25a6c2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 06:52:18 -0800 Subject: [PATCH 073/121] tweak thread guard logic in groupby template --- python/cudf/cudf/core/udf/templates.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py index f982d904d7f..9a032146992 100644 --- a/python/cudf/cudf/core/udf/templates.py +++ b/python/cudf/cudf/core/udf/templates.py @@ -74,5 +74,7 @@ def _kernel(offset, out, index, {input_columns}, {extra_args}): {group_initializers} - out[block_id] = f_(dataframe_group, {extra_args}) + result = f_(dataframe_group, {extra_args}) + if cuda.threadIdx.x == 0: + out[block_id] = result """ From 1828ef722e17f8a4aa069ef1d89152cada7aa393 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 24 Jan 2023 12:00:31 -0600 Subject: [PATCH 074/121] Apply suggestions from code review Co-authored-by: Vyas Ramasubramani --- python/cudf/CMakeLists.txt | 1 - python/cudf/cudf/core/udf/groupby_lowering.py | 6 +----- python/cudf/cudf/core/udf/groupby_typing.py | 4 ++-- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 50a60014cc8..c52570e1357 100644 --- 
a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -29,7 +29,6 @@ project( # that is fixed we need to keep C. C CXX - # Temporarily enabling for groupby UDFs compilation until we come up with a better # solution. CUDA ) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index faf4320caf1..7d43862fb66 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -145,11 +145,7 @@ def cuda_Group_size(context, builder, sig, args): return grp.size -def cuda_Group_count(context, builder, sig, args): - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - return grp.size +cuda_Group_count = cuda_Group_size for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 82cd863a42e..f1c02375803 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -25,7 +25,7 @@ ] -class Group(object): +class Group: """ A piece of python code whose purpose is to be replaced during compilation. After being registered to GroupType, @@ -97,7 +97,7 @@ def __init__( ("size", types.int64), ("index", types.CPointer(fe_type.index_type)), ] - models.StructModel.__init__(self, dmm, fe_type, members) + super().__init__(self, dmm, fe_type, members) call_cuda_functions: Dict[Any, Any] = {} From 648995021873e74fecb07baa9624396d1c0c2130 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 10:07:31 -0800 Subject: [PATCH 075/121] fix small bug --- python/cudf/cudf/core/udf/groupby_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index f1c02375803..7daefdfd393 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -97,7 +97,7 @@ def __init__( ("size", types.int64), ("index", types.CPointer(fe_type.index_type)), ] - super().__init__(self, dmm, fe_type, members) + super().__init__(dmm, fe_type, members) call_cuda_functions: Dict[Any, Any] = {} From 9b83d786d885d4746319072879ad956bff482c74 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 10:20:00 -0800 Subject: [PATCH 076/121] refactor group constructor lowering --- python/cudf/cudf/core/udf/groupby_lowering.py | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 7d43862fb66..710ec751a7e 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -60,30 +60,15 @@ def group_constructor(context, builder, sig, args): Instruction boilerplate used for instantiating a Group struct from a data pointer, an index pointer, and a size """ - - group_data, size, index = args - # a variable logically corresponding to the calling `Group` grp = cgutils.create_struct_proxy(sig.return_type)(context, builder) - - # the group data array and its pointer - arr_group_data = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=group_data - ) - group_data_ptr = arr_group_data.data - - # the group index array and its pointer - arr_index = cgutils.create_struct_proxy(sig.args[2])( - context, builder, value=index - ) - index_ptr = arr_index.data - - # fill the struct explicitly - grp.group_data = group_data_ptr 
- grp.index = index_ptr - grp.size = size - - # return the struct by value + grp.group_data = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ).data + grp.index = cgutils.create_struct_proxy(sig.args[2])( + context, builder, value=args[2] + ).data + grp.size = args[1] return grp._getvalue() From 73a2ba138108be81be8ad65b56b4c6ebd4e4b637 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 10:52:20 -0800 Subject: [PATCH 077/121] partially address reviews --- python/cudf/cudf/core/groupby/groupby.py | 3 +- python/cudf/cudf/core/udf/groupby_lowering.py | 32 +++++++++++-------- python/cudf/cudf/core/udf/groupby_typing.py | 26 ++++++--------- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d740596f719..e42c7897b42 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -881,7 +881,8 @@ def mult(df): if ngroups > self._MAX_GROUPS_BEFORE_WARN: warnings.warn( f"GroupBy.apply() performance scales poorly with " - f"number of groups. Got {ngroups} groups." + f"number of groups. Got {ngroups} groups. Some functions " + "may perform better by passing engine='jit'" ) chunks = [ diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 710ec751a7e..d2b9700fbc9 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -17,7 +17,7 @@ ) -def lowering_function(context, builder, sig, args, function): +def group_reduction_impl_basic(context, builder, sig, args, function): """ Instruction boilerplate used for calling a groupby reduction __device__ function. Centers around a forward declaration of @@ -72,14 +72,14 @@ def group_constructor(context, builder, sig, args): return grp._getvalue() -def cuda_Group_idx_max_or_min(context, builder, sig, args, function): +def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): """ Instruction boilerplate used for calling a groupby reduction __device__ function in the case where the function is either - `idxmax` or `idxmin`. See `lowering_function` for details. This - lowering differs from other reductions due to the presence of - the index. This results in the forward declaration expecting - an extra arg. + `idxmax` or `idxmin`. See `group_reduction_impl_basic` for + details. This lowering differs from other reductions due to + the presence of the index. This results in the forward + declaration expecting an extra arg. 
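The "extra arg" is the pointer to the group's index values: the index-returning reductions need to report which label holds the winning element, so their forward declarations take three parameters instead of two. Assuming float64 group data and the default int64 index, the two signature families look roughly like this:

    from numba import types

    # plain reduction shim: (group data pointer, group size) -> scalar
    plain_sig = types.float64(types.CPointer(types.float64), types.int64)

    # idxmax/idxmin shim: (group data pointer, index pointer, group size)
    # -> value drawn from the group's index
    idx_sig = types.int64(
        types.CPointer(types.float64),
        types.CPointer(types.int64),
        types.int64,
    )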
""" retty = sig.return_type @@ -112,15 +112,19 @@ def cuda_Group_idx_max_or_min(context, builder, sig, args, function): ) -cuda_Group_max = partial(lowering_function, function="max") -cuda_Group_min = partial(lowering_function, function="min") -cuda_Group_sum = partial(lowering_function, function="sum") -cuda_Group_mean = partial(lowering_function, function="mean") -cuda_Group_std = partial(lowering_function, function="std") -cuda_Group_var = partial(lowering_function, function="var") +cuda_Group_max = partial(group_reduction_impl_basic, function="max") +cuda_Group_min = partial(group_reduction_impl_basic, function="min") +cuda_Group_sum = partial(group_reduction_impl_basic, function="sum") +cuda_Group_mean = partial(group_reduction_impl_basic, function="mean") +cuda_Group_std = partial(group_reduction_impl_basic, function="std") +cuda_Group_var = partial(group_reduction_impl_basic, function="var") -cuda_Group_idxmax = partial(cuda_Group_idx_max_or_min, function="idxmax") -cuda_Group_idxmin = partial(cuda_Group_idx_max_or_min, function="idxmin") +cuda_Group_idxmax = partial( + group_reduction_impl_idx_max_or_min, function="idxmax" +) +cuda_Group_idxmin = partial( + group_reduction_impl_idx_max_or_min, function="idxmin" +) def cuda_Group_size(context, builder, sig, args): diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 7daefdfd393..863ce4aa19c 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -33,12 +33,7 @@ class Group: in python code and accessing their attributes """ - def __init__(self, group_data, size, index, dtype, index_dtype): - self.group_data = group_data - self.size = size - self.index = index - self.dtype = dtype - self.index_dtype = index_dtype + pass class GroupType(numba.types.Type): @@ -103,24 +98,24 @@ def __init__( call_cuda_functions: Dict[Any, Any] = {} -def _register_cuda_reduction_caller(func, inputty, retty): +def _register_cuda_reduction_caller(funcname, inputty, retty): cuda_func = cuda.declare_device( - f"Block{func}_{inputty}", retty(types.CPointer(inputty), types.int64) + f"Block{funcname}_{inputty}", + retty(types.CPointer(inputty), types.int64), ) def caller(data, size): return cuda_func(data, size) - if call_cuda_functions.get(func.lower()) is None: - call_cuda_functions[func.lower()] = {} + call_cuda_functions.setdefault(funcname.lower(), {}) type_key = (retty, inputty) - call_cuda_functions[func.lower()][type_key] = caller + call_cuda_functions[funcname.lower()][type_key] = caller -def _register_cuda_idxreduction_caller(func, inputty): +def _register_cuda_idxreduction_caller(funcname, inputty): cuda_func = cuda.declare_device( - f"Block{func}_{inputty}", + f"Block{funcname}_{inputty}", types.int64( types.CPointer(inputty), types.CPointer(types.int64), types.int64 ), @@ -131,9 +126,8 @@ def caller(data, index, size): # only support default index type right now type_key = (index_default_type, inputty) - if call_cuda_functions.get(func.lower()) is None: - call_cuda_functions[func.lower()] = {} - call_cuda_functions[func.lower()][type_key] = caller + call_cuda_functions.setdefault(funcname.lower(), {}) + call_cuda_functions[funcname.lower()][type_key] = caller def _create_reduction_attr(name, retty=None): From 4dfb7904b53243e115af439e35a89377402fa616 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 11:08:10 -0800 Subject: [PATCH 078/121] continue addressing reviews --- python/cudf/cudf/__init__.py | 3 ++- 
python/cudf/cudf/core/udf/groupby_typing.py | 24 +++++++++------------ python/cudf/cudf/core/udf/utils.py | 6 ++---- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index a797d82530b..49d5d329a3b 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -92,9 +92,10 @@ # cuDF requires a stronger set of conditions than what is # checked by patch_numba_linker_if_needed due to the PTX # files needed for JIT Groupby Apply and string UDFs + from cudf.core.udf.groupby_utils import dev_func_ptx from cudf.core.udf.utils import _setup_numba_linker - _setup_numba_linker(os.path.dirname(__file__) + "/core/udf/", "function_") + _setup_numba_linker(dev_func_ptx) del patch_numba_linker_if_needed diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 863ce4aa19c..2ce525abb1b 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -113,7 +113,7 @@ def caller(data, size): call_cuda_functions[funcname.lower()][type_key] = caller -def _register_cuda_idxreduction_caller(funcname, inputty): +def _register_cuda_idx_reduction_caller(funcname, inputty): cuda_func = cuda.declare_device( f"Block{funcname}_{inputty}", types.int64( @@ -134,15 +134,11 @@ def _create_reduction_attr(name, retty=None): class Attr(AbstractTemplate): key = name - if retty: - - def generic(self, args, kws): - return nb_signature(retty, recvr=self.this) - - else: - - def generic(self, args, kws): - return nb_signature(self.this.group_scalar_type, recvr=self.this) + def generic(self, args, kws): + return nb_signature( + self.this.group_scalar_type if not retty else retty, + recvr=self.this, + ) Attr.generic = generic @@ -210,10 +206,10 @@ def resolve_idxmin(self, mod): _register_cuda_reduction_caller("Std", types.float64, types.float64) _register_cuda_reduction_caller("Var", types.int64, types.float64) _register_cuda_reduction_caller("Var", types.float64, types.float64) -_register_cuda_idxreduction_caller("IdxMax", types.int64) -_register_cuda_idxreduction_caller("IdxMax", types.float64) -_register_cuda_idxreduction_caller("IdxMin", types.int64) -_register_cuda_idxreduction_caller("IdxMin", types.float64) +_register_cuda_idx_reduction_caller("IdxMax", types.int64) +_register_cuda_idx_reduction_caller("IdxMax", types.float64) +_register_cuda_idx_reduction_caller("IdxMin", types.int64) +_register_cuda_idx_reduction_caller("IdxMin", types.float64) make_attribute_wrapper(GroupType, "group_data", "group_data") diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 8b4ce9ae15b..d8f49deb98c 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -389,20 +389,18 @@ def _get_cuda_version_from_ptx_file(path): return cuda_ver -def _setup_numba_linker(path, prefix): +def _setup_numba_linker(path): from ptxcompiler.patch import NO_DRIVER, safe_get_versions from cudf.core.udf.utils import ( _get_cuda_version_from_ptx_file, - _get_ptx_file, maybe_patch_numba_linker, ) versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptxpath = _get_ptx_file(path, prefix) - ptx_toolkit_version = _get_cuda_version_from_ptx_file(ptxpath) + ptx_toolkit_version = _get_cuda_version_from_ptx_file(path) maybe_patch_numba_linker(driver_version, ptx_toolkit_version) From 43be944edcd96b7a406d0da692135518c66eeb2e Mon Sep 17 00:00:00 2001 From: brandon-b-miller 
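The decision implemented by maybe_patch_numba_linker reduces to a tuple comparison between the installed driver and the toolkit that produced the shipped PTX. An illustration with made-up version numbers (the real values come from safe_get_versions() and from _get_cuda_version_from_ptx_file):

    driver_version = (11, 5)       # e.g. reported by safe_get_versions()
    ptx_toolkit_version = (11, 8)  # parsed out of the .ptx header comment

    # an older driver cannot load PTX emitted by a newer toolkit, so the
    # cubinlinker/ptxcompiler patch has to be applied in that case
    needs_patch = driver_version < ptx_toolkit_version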
Date: Tue, 24 Jan 2023 11:10:22 -0800 Subject: [PATCH 079/121] extraneous copyright --- python/cudf/cudf/utils/gpu_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index c10dd8ffb3e..ab3adc1651a 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. def validate_setup(): From b5f8f6339ec9dd11e8aa3c89206cab41362b1804 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 11:11:00 -0800 Subject: [PATCH 080/121] fix small comment error in cmake --- python/cudf/CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index c52570e1357..638606e27bc 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -27,10 +27,7 @@ project( # language to be enabled here. The test project that is built in scikit-build to verify # various linking options for the python library is hardcoded to build with C, so until # that is fixed we need to keep C. - C - CXX - # solution. - CUDA + C CXX CUDA ) option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" From 8bbd7254c7703563dd3e3e585946fd09a4707a17 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 11:28:23 -0800 Subject: [PATCH 081/121] inline _is_jit_supported_type --- python/cudf/cudf/core/udf/utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index d8f49deb98c..f6899473c18 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -20,7 +20,6 @@ import rmm from cudf.core.column.column import as_column -from cudf.core.dtypes import CategoricalDtype from cudf.core.udf.masked_typing import MaskedType from cudf.utils import cudautils from cudf.utils.dtypes import ( @@ -97,17 +96,10 @@ def _get_udf_return_type(argty, func: Callable, args=()): return result -def _is_jit_supported_type(dtype, supported_types): - # category dtype isn't hashable - if isinstance(dtype, CategoricalDtype): - return False - return str(dtype) in supported_types - - def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col.dtype - if _is_jit_supported_type(col.dtype, supported_types=supported_types) + if str(col.dtype) in supported_types else np.dtype("O") for colname, col in frame._data.items() } @@ -117,7 +109,7 @@ def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col.dtype for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype, supported_types=supported_types) + if str(col.dtype) in supported_types } @@ -125,7 +117,7 @@ def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype, supported_types=supported_types) + if str(col.dtype) in supported_types } From 2df32167d6b80bfc2c4d4f3625635d57feee67f4 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 12:15:22 -0800 Subject: [PATCH 082/121] adjust logic in maybe_patch_numba_linker --- python/cudf/cudf/core/udf/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py 
b/python/cudf/cudf/core/udf/utils.py index f6899473c18..7c49876a895 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -393,14 +393,20 @@ def _setup_numba_linker(path): if versions != NO_DRIVER: driver_version, runtime_version = versions ptx_toolkit_version = _get_cuda_version_from_ptx_file(path) - maybe_patch_numba_linker(driver_version, ptx_toolkit_version) + maybe_patch_numba_linker( + driver_version, runtime_version, ptx_toolkit_version + ) -def maybe_patch_numba_linker(driver_version, ptx_toolkit_version): +def maybe_patch_numba_linker( + driver_version, runtime_version, ptx_toolkit_version +): # Numba thinks cubinlinker is only needed if the driver is older than # the ctk, but when PTX files are present, it might also need to patch # because those PTX files may newer than the driver as well - if driver_version < ptx_toolkit_version: + if (driver_version < ptx_toolkit_version) or ( + driver_version < runtime_version + ): print( "Driver version %s.%s needs patching due to PTX files" % driver_version From e8137e3fb0b4903cd3d62fc008aabe5e50b6de23 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 15:41:23 -0500 Subject: [PATCH 083/121] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/udf_cpp/groupby/function.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index da62eac7c9e..9049e9f5c19 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -58,9 +58,8 @@ __device__ void device_var(cooperative_groups::thread_block const& block, #pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { - auto temp = static_cast(data[idx]) - mean; - temp *= temp; - local_var += temp; + auto delta = static_cast(data[idx]) - mean; + local_var += delta * delta; } cuda::atomic_ref ref{*var}; @@ -129,8 +128,8 @@ __device__ T BlockMax(T const* data, int64_t size) auto block = cooperative_groups::this_thread_block(); auto local_max = []() { - if constexpr (std::is_floating_point_v) { return -std::numeric_limits::max(); } - return std::numeric_limits::min(); + if constexpr (std::is_floating_point_v) { return -std::numeric_limits::infinity(); } + return std::numeric_limits::lowest(); }(); __shared__ T block_max; if (block.thread_rank() == 0) { block_max = local_max; } @@ -181,8 +180,8 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) __shared__ int64_t block_idx_max; auto local_max = []() { - if constexpr (std::is_floating_point_v) { return -std::numeric_limits::max(); } - return std::numeric_limits::min(); + if constexpr (std::is_floating_point_v) { return -std::numeric_limits::infinity(); } + return std::numeric_limits::lowest(); }(); auto local_idx_max = std::numeric_limits::max(); From c5e744569a1e3f7995f35f6f4a3d9192199e640e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 15:44:27 -0500 Subject: [PATCH 084/121] Correct numerical limits --- python/cudf/udf_cpp/groupby/function.cu | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 9049e9f5c19..dcfd0cd7b44 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -58,7 +58,7 @@ __device__ void device_var(cooperative_groups::thread_block const& block, #pragma unroll for (int64_t 
idx = block.thread_rank(); idx < size; idx += block.size()) { - auto delta = static_cast(data[idx]) - mean; + auto const delta = static_cast(data[idx]) - mean; local_var += delta * delta; } @@ -153,7 +153,11 @@ __device__ T BlockMin(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); - auto local_min = std::numeric_limits::max(); + auto local_min = []() { + if constexpr (std::is_floating_point_v) { return std::numeric_limits::infinity(); } + return std::numeric_limits::max(); + }(); + __shared__ T block_min; if (block.thread_rank() == 0) { block_min = local_min; } block.sync(); @@ -221,7 +225,10 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) __shared__ T block_min; __shared__ int64_t block_idx_min; - auto local_min = std::numeric_limits::max(); + auto local_min = []() { + if constexpr (std::is_floating_point_v) { return std::numeric_limits::infinity(); } + return std::numeric_limits::max(); + }(); auto local_idx_min = std::numeric_limits::max(); if (block.thread_rank() == 0) { From 761261c142097c74e09a9b17a76edf0303d9cf29 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 24 Jan 2023 14:53:52 -0600 Subject: [PATCH 085/121] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/udf/groupby_typing.py | 5 ++--- python/cudf/cudf/core/udf/groupby_utils.py | 8 +++++--- python/cudf/cudf/core/udf/utils.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/udf_cpp/groupby/CMakeLists.txt | 2 +- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e42c7897b42..71b60bf2bde 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -792,7 +792,7 @@ def apply(self, function, *args, engine="cudf"): Parameters ---------- - func : function + function : callable The python transformation function that will be applied on the grouped chunk. 
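The engine keyword documented in this hunk can be exercised as sketched below; the sketch mirrors the tests added later in this series, and the key and column names are made up:

    import cudf

    df = cudf.DataFrame(
        {
            "key1": [0, 0, 1, 1],
            "val1": [1.0, 2.0, 3.0, 4.0],
            "val2": [4.0, 3.0, 2.0, 1.0],
        }
    )

    def udf(group):
        # block-wide reductions over numeric columns are what the JIT
        # pipeline handles at this point in the series
        return group["val1"].max() - group["val2"].min()

    result = df.groupby("key1").apply(udf, engine="jit")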
engine: {'cudf', 'jit'}, default 'cudf' diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 2ce525abb1b..15a4d1d7683 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -212,6 +212,5 @@ def resolve_idxmin(self, mod): _register_cuda_idx_reduction_caller("IdxMin", types.float64) -make_attribute_wrapper(GroupType, "group_data", "group_data") -make_attribute_wrapper(GroupType, "index", "index") -make_attribute_wrapper(GroupType, "size", "size") +for attr in ("group_data", "index", "size") + make_attribute_wrapper(GroupType, attr, attr) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index dc32dc61416..7d2fa48b10b 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -72,7 +72,9 @@ def _get_frame_groupby_type(dtype, index_dtype): # Align the next member of the struct to be a multiple of the # memory access size, per PTX ISA 7.4/5.4.5 if i < len(sizes) - 1: - offset = int(math.ceil(offset / 8) * 8) + alignment = offset % 8 + if alignment != 0: + offset += 8 - alignment # Numba requires that structures are aligned for the CUDA target _is_aligned_struct = True @@ -145,8 +147,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): grouped_values : DataFrame A DataFrame representing the source data sorted by group keys - function: callable - The user UDF defined on a DataFrame + function : callable + The user-defined function to execute """ offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 7c49876a895..c2e8cb4aa30 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -277,7 +277,7 @@ def _post_process_output_col(col, retty): def _get_appropriate_file(sms, cc): - filtered_sms = list(filter(lambda x: x[0] <= cc, sms)) + filtered_sms = [x for x in sms if x[0] <= cc] if filtered_sms: return max(filtered_sms, key=lambda y: y[0]) else: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e969e8ab7ce..367f513a7c9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -393,9 +393,9 @@ def run_groupby_apply_jit_test(data, func, keys, *args): expect_groupby_obj = data.to_pandas().groupby(keys, as_index=False) got_groupby_obj = data.groupby(keys) + # compare cuDF jit to pandas cudf_jit_result = got_groupby_obj.apply(func, *args, engine="jit") pandas_result = expect_groupby_obj.apply(func, *args) - # compare cuDF jit to pandas assert_groupby_results_equal(cudf_jit_result, pandas_result) diff --git a/python/cudf/udf_cpp/groupby/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt index fcf036a0812..190a590ab79 100644 --- a/python/cudf/udf_cpp/groupby/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -23,7 +23,7 @@ rapids_cuda_init_architectures(groupby-udf-cpp) # Create a project so that we can enable CUDA architectures in this file. 
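A quick worked example of the member-padding rule changed earlier in this patch (groupby_utils.py): assuming the previous struct member ends at byte offset 12, the next member is pushed to offset 16, the next multiple of the 8-byte access size.

    offset = 12              # first free byte after the previous member
    alignment = offset % 8   # 4
    if alignment != 0:
        offset += 8 - alignment
    # offset is now 16, so the next member starts on an 8-byte boundary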
project( groupby-udf-cpp - VERSION 0.0.0 # Placeholder since this isn't a real project + VERSION 23.02.00 LANGUAGES CUDA ) From 9884897f9c71e349859cfcea73c1e1d02c87c2e3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 12:57:08 -0800 Subject: [PATCH 086/121] fix small syntax error --- python/cudf/cudf/core/udf/groupby_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 15a4d1d7683..99c12560921 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -212,5 +212,5 @@ def resolve_idxmin(self, mod): _register_cuda_idx_reduction_caller("IdxMin", types.float64) -for attr in ("group_data", "index", "size") +for attr in ("group_data", "index", "size"): make_attribute_wrapper(GroupType, attr, attr) From 41b42c7c6a5df5a26926155d8065c2866b4dc9ee Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 12:58:51 -0800 Subject: [PATCH 087/121] add an updater to update-version.sh --- ci/release/update-version.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 335d907b7b9..c59b6bc4f1d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,6 +43,9 @@ sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/ # Strings UDF update sed_runner 's/'"strings_udf_version .*)"'/'"strings_udf_version ${NEXT_FULL_TAG})"'/g' python/strings_udf/CMakeLists.txt +# Groupby UDF update +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' python/cudf/udf_cpp/CMakeLists.txt + # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt From 5855f5c963284bb91c0e93e15eb53e5a3b15038f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 13:14:54 -0800 Subject: [PATCH 088/121] refactor groupby.apply top level impl into separate methods --- python/cudf/cudf/core/groupby/groupby.py | 113 ++++++++++++++--------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 71b60bf2bde..fbcfaa447d3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -787,6 +787,60 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) + def _jit_groupby_apply( + self, function, group_names, offsets, group_keys, grouped_values, *args + ): + # Nulls are not yet supported + for colname in self.grouping.values._data.keys(): + if self.obj._data[colname].has_nulls(): + raise ValueError( + "Nulls not yet supported with groupby JIT engine" + ) + + chunk_results = jit_groupby_apply( + offsets, grouped_values, function, *args + ) + result = cudf.Series(chunk_results, index=group_names) + result.index.names = self.grouping.names + result = result.reset_index() + result[None] = result.pop(0) + return result + + def _iterative_groupby_apply( + self, function, group_names, offsets, group_keys, grouped_values, *args + ): + ngroups = len(offsets) - 1 + if ngroups > self._MAX_GROUPS_BEFORE_WARN: + warnings.warn( + f"GroupBy.apply() performance scales poorly with " + f"number of groups. Got {ngroups} groups. 
Some functions " + "may perform better by passing engine='jit'", + RuntimeWarning, + ) + + chunks = [ + grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) + ] + chunk_results = [function(chk, *args) for chk in chunks] + if not len(chunk_results): + return self.obj.head(0) + + if cudf.api.types.is_scalar(chunk_results[0]): + result = cudf.Series(chunk_results, index=group_names) + result.index.names = self.grouping.names + elif isinstance(chunk_results[0], cudf.Series) and isinstance( + self.obj, cudf.DataFrame + ): + result = cudf.concat(chunk_results, axis=1).T + result.index.names = self.grouping.names + else: + result = cudf.concat(chunk_results) + if self._group_keys: + index_data = group_keys._data.copy(deep=True) + index_data[None] = grouped_values.index._column + result.index = cudf.MultiIndex._from_data(index_data) + return result + def apply(self, function, *args, engine="cudf"): """Apply a python transformation function over the grouped chunk. @@ -795,6 +849,8 @@ def apply(self, function, *args, engine="cudf"): function : callable The python transformation function that will be applied on the grouped chunk. + args : tuple + Optional positional arguments to pass to the function. engine: {'cudf', 'jit'}, default 'cudf' Selects the GroupBy.apply implementation. Use `jit` to select the numba JIT pipeline. @@ -862,50 +918,23 @@ def mult(df): group_names, offsets, group_keys, grouped_values = self._grouped() if engine == "jit": - # Nulls are not yet supported - for colname in self.grouping.values._data.keys(): - if self.obj._data[colname].has_nulls(): - raise ValueError( - "Nulls not yet supported with groupby JIT engine" - ) - - chunk_results = jit_groupby_apply( - offsets, grouped_values, function, *args + result = self._jit_groupby_apply( + function, + group_names, + offsets, + group_keys, + grouped_values, + *args, ) - result = cudf.Series(chunk_results, index=group_names) - result.index.names = self.grouping.names - result = result.reset_index() - result[None] = result.pop(0) elif engine == "cudf": - ngroups = len(offsets) - 1 - if ngroups > self._MAX_GROUPS_BEFORE_WARN: - warnings.warn( - f"GroupBy.apply() performance scales poorly with " - f"number of groups. Got {ngroups} groups. 
Some functions " - "may perform better by passing engine='jit'" - ) - - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] - chunk_results = [function(chk, *args) for chk in chunks] - if not len(chunk_results): - return self.obj.head(0) - - if cudf.api.types.is_scalar(chunk_results[0]): - result = cudf.Series(chunk_results, index=group_names) - result.index.names = self.grouping.names - elif isinstance(chunk_results[0], cudf.Series) and isinstance( - self.obj, cudf.DataFrame - ): - result = cudf.concat(chunk_results, axis=1).T - result.index.names = self.grouping.names - else: - result = cudf.concat(chunk_results) - if self._group_keys: - index_data = group_keys._data.copy(deep=True) - index_data[None] = grouped_values.index._column - result.index = cudf.MultiIndex._from_data(index_data) + result = self._iterative_groupby_apply( + function, + group_names, + offsets, + group_keys, + grouped_values, + *args, + ) else: raise ValueError(f"Unsupported engine '{engine}'") From d6a3ef284b7d505b426b4703c05dad965fbcacf1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 13:19:33 -0800 Subject: [PATCH 089/121] GroupType.size_type -> GroupType.group_size_type --- python/cudf/cudf/core/udf/groupby_lowering.py | 6 ++++-- python/cudf/cudf/core/udf/groupby_typing.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index d2b9700fbc9..75dbf8fdf01 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -49,7 +49,7 @@ def group_reduction_impl_basic(context, builder, sig, args, function): return context.compile_internal( builder, func, - nb_signature(retty, group_dataty, grp_type.size_type), + nb_signature(retty, group_dataty, grp_type.group_size_type), (builder.load(group_data_ptr), grp.size), ) @@ -107,7 +107,9 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): return context.compile_internal( builder, func, - nb_signature(retty, group_dataty, index_dataty, grp_type.size_type), + nb_signature( + retty, group_dataty, index_dataty, grp_type.group_size_type + ), (builder.load(group_data_ptr), builder.load(index_ptr), grp.size), ) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 99c12560921..0e48dd6e2da 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -47,7 +47,7 @@ def __init__(self, group_scalar_type, index_type=index_default_type): self.group_scalar_type = group_scalar_type self.index_type = index_type self.group_data_type = types.CPointer(group_scalar_type) - self.size_type = types.int64 + self.group_size_type = types.int64 self.group_index_type = types.CPointer(index_type) super().__init__( name=f"Group({self.group_scalar_type}, {self.index_type})" From 9b60a6270e25b1d879f4717398821892344ded35 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 13:28:09 -0800 Subject: [PATCH 090/121] introduce group_size_type as a global --- python/cudf/cudf/core/udf/groupby_lowering.py | 3 ++- python/cudf/cudf/core/udf/groupby_typing.py | 24 ++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 75dbf8fdf01..92e55c975a5 100644 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ 
b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -13,6 +13,7 @@ Group, GroupType, call_cuda_functions, + group_size_type, index_default_type, ) @@ -54,7 +55,7 @@ def group_reduction_impl_basic(context, builder, sig, args, function): ) -@lower_builtin(Group, types.Array, types.int64, types.Array) +@lower_builtin(Group, types.Array, group_size_type, types.Array) def group_constructor(context, builder, sig, args): """ Instruction boilerplate used for instantiating a Group diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 0e48dd6e2da..712d9df147e 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -19,6 +19,7 @@ index_default_type = numpy_support.from_dtype( pd.RangeIndex(0, 0).dtype ) # int64 +group_size_type = types.int64 SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] SUPPORTED_GROUPBY_NUMPY_TYPES = [ numpy_support.as_dtype(dt) for dt in [types.int64, types.float64] @@ -47,7 +48,7 @@ def __init__(self, group_scalar_type, index_type=index_default_type): self.group_scalar_type = group_scalar_type self.index_type = index_type self.group_data_type = types.CPointer(group_scalar_type) - self.group_size_type = types.int64 + self.group_size_type = group_size_type self.group_index_type = types.CPointer(index_type) super().__init__( name=f"Group({self.group_scalar_type}, {self.index_type})" @@ -84,12 +85,15 @@ def typer(group_data, size, index): @register_model(GroupType) class GroupModel(models.StructModel): - def __init__( - self, dmm, fe_type - ): # fe_type is fully instantiated group type + """ + Model backing GroupType instances. See the link below for details. + https://github.com/numba/numba/blob/main/numba/core/datamodel/models.py + """ + + def __init__(self, dmm, fe_type): members = [ ("group_data", types.CPointer(fe_type.group_scalar_type)), - ("size", types.int64), + ("size", group_size_type), ("index", types.CPointer(fe_type.index_type)), ] super().__init__(dmm, fe_type, members) @@ -101,7 +105,7 @@ def __init__( def _register_cuda_reduction_caller(funcname, inputty, retty): cuda_func = cuda.declare_device( f"Block{funcname}_{inputty}", - retty(types.CPointer(inputty), types.int64), + retty(types.CPointer(inputty), group_size_type), ) def caller(data, size): @@ -117,7 +121,9 @@ def _register_cuda_idx_reduction_caller(funcname, inputty): cuda_func = cuda.declare_device( f"Block{funcname}_{inputty}", types.int64( - types.CPointer(inputty), types.CPointer(types.int64), types.int64 + types.CPointer(inputty), + types.CPointer(index_default_type), + group_size_type, ), ) @@ -172,7 +178,9 @@ class GroupAttr(AttributeTemplate): resolve_min = _create_reduction_attr("GroupType.min") resolve_sum = _create_reduction_attr("GroupType.sum") - resolve_size = _create_reduction_attr("GroupType.size", retty=types.int64) + resolve_size = _create_reduction_attr( + "GroupType.size", retty=group_size_type + ) resolve_count = _create_reduction_attr( "GroupType.count", retty=types.int64 ) From f0a9af85466bb05501875fa884ea9e2235207b75 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 13:46:42 -0800 Subject: [PATCH 091/121] use index_default_type in idxmax/idxmin lowering --- python/cudf/cudf/core/udf/groupby_lowering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py index 92e55c975a5..376eccb9308 100644 --- 
a/python/cudf/cudf/core/udf/groupby_lowering.py +++ b/python/cudf/cudf/core/udf/groupby_lowering.py @@ -102,7 +102,7 @@ def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): index_dataty = grp_type.group_index_type index_ptr = builder.alloca(grp.index.type) builder.store(grp.index, index_ptr) - type_key = (types.int64, grp_type.group_scalar_type) + type_key = (index_default_type, grp_type.group_scalar_type) func = call_cuda_functions[function][type_key] return context.compile_internal( From 6708655fd843de691c85de8ed2c90707f8c3aca1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 13:51:02 -0800 Subject: [PATCH 092/121] rename some utility functions and add docs --- python/cudf/cudf/core/udf/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index c2e8cb4aa30..fd4fca6cef8 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -276,7 +276,11 @@ def _post_process_output_col(col, retty): return as_column(col, retty) -def _get_appropriate_file(sms, cc): +def _get_best_ptx_file(sms, cc): + """ + Determine of the available PTX files which one is + the most recent up to and including the device cc + """ filtered_sms = [x for x in sms if x[0] <= cc] if filtered_sms: return max(filtered_sms, key=lambda y: y[0]) @@ -312,7 +316,7 @@ def _get_ptx_file(path, prefix): regular_result = None if regular_sms: - regular_result = _get_appropriate_file(regular_sms, cc) + regular_result = _get_best_ptx_file(regular_sms, cc) if regular_result is None: raise RuntimeError( From 3e5149da4db45378b96855d41fa85b5c90a16265 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 13:52:07 -0800 Subject: [PATCH 093/121] tweak previous function --- python/cudf/cudf/core/udf/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index fd4fca6cef8..8591d8d60f8 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -276,14 +276,14 @@ def _post_process_output_col(col, retty): return as_column(col, retty) -def _get_best_ptx_file(sms, cc): +def _get_best_ptx_file(archs, max_compute_capability): """ Determine of the available PTX files which one is the most recent up to and including the device cc """ - filtered_sms = [x for x in sms if x[0] <= cc] - if filtered_sms: - return max(filtered_sms, key=lambda y: y[0]) + filtered_archs = [x for x in archs if x[0] <= max_compute_capability] + if filtered_archs: + return max(filtered_archs, key=lambda y: y[0]) else: return None From c253b8f9242b487dbd6f654df762f55f8685c20b Mon Sep 17 00:00:00 2001 From: Bobbi Yogatama Date: Tue, 24 Jan 2023 21:52:11 +0000 Subject: [PATCH 094/121] Addressing reviewers' comments --- python/cudf/udf_cpp/groupby/function.cu | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index dcfd0cd7b44..8a227db0030 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -263,7 +263,9 @@ extern "C" { #define make_definition(name, cname, type, return_type) \ __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ { \ - *numba_return_value = name(data, size); \ + return_type const res = name(data, size); \ + if (threadIdx.x == 0) *numba_return_value 
= res; \ + __syncthreads(); \ return 0; \ } @@ -287,9 +289,12 @@ extern "C" { __device__ int name##_##cname( \ int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ { \ - *numba_return_value = name(data, index, size); \ + auto const res = name(data, index, size); \ + if (threadIdx.x == 0) *numba_return_value = res; \ + __syncthreads(); \ return 0; \ } + make_definition_idx(BlockIdxMin, int64, int64_t); make_definition_idx(BlockIdxMin, float64, double); make_definition_idx(BlockIdxMax, int64, int64_t); From bae845d1ce44bad8dfb59ef96d886954cf32cfef Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 14:00:48 -0800 Subject: [PATCH 095/121] unused import --- python/cudf/cudf/core/udf/groupby_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 7d2fa48b10b..272db34a1e5 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. -import math import os import cupy as cp From e91b641816e9c143e6d82ba909e61c13ddfff340 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 17:18:29 -0500 Subject: [PATCH 096/121] Replace std numeric limits with cudf device operators --- python/cudf/udf_cpp/groupby/function.cu | 33 +++++++++---------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index dcfd0cd7b44..a02aca8ff38 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -14,12 +14,12 @@ * limitations under the License. */ +#include + #include #include -#include - template __device__ void device_sum(cooperative_groups::thread_block const& block, T const* data, @@ -127,10 +127,7 @@ __device__ T BlockMax(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); - auto local_max = []() { - if constexpr (std::is_floating_point_v) { return -std::numeric_limits::infinity(); } - return std::numeric_limits::lowest(); - }(); + auto local_max = cudf::DeviceMax::identity(); __shared__ T block_max; if (block.thread_rank() == 0) { block_max = local_max; } block.sync(); @@ -153,13 +150,13 @@ __device__ T BlockMin(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); - auto local_min = []() { - if constexpr (std::is_floating_point_v) { return std::numeric_limits::infinity(); } - return std::numeric_limits::max(); - }(); + auto local_min = cudf::DeviceMin::identity(); __shared__ T block_min; - if (block.thread_rank() == 0) { block_min = local_min; } + if (block.thread_rank() == 0) { + block_min = local_min; + printf("min: %lld\n", int64_t(local_min)); + } block.sync(); #pragma unroll @@ -183,11 +180,8 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) __shared__ T block_max; __shared__ int64_t block_idx_max; - auto local_max = []() { - if constexpr (std::is_floating_point_v) { return -std::numeric_limits::infinity(); } - return std::numeric_limits::lowest(); - }(); - auto local_idx_max = std::numeric_limits::max(); + auto local_max = cudf::DeviceMax::identity(); + auto local_idx_max = cudf::DeviceMin::identity(); if (block.thread_rank() == 0) { block_max = local_max; @@ -225,11 +219,8 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) __shared__ T block_min; __shared__ int64_t block_idx_min; - auto 
local_min = []() { - if constexpr (std::is_floating_point_v) { return std::numeric_limits::infinity(); } - return std::numeric_limits::max(); - }(); - auto local_idx_min = std::numeric_limits::max(); + auto local_min = cudf::DeviceMin::identity(); + auto local_idx_min = cudf::DeviceMin::identity(); if (block.thread_rank() == 0) { block_min = local_min; From 73892e118334a3c70f5ee2322c57cb1b94ef2fa0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Tue, 24 Jan 2023 14:52:09 -0800 Subject: [PATCH 097/121] add tests for special values --- python/cudf/cudf/tests/test_groupby.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 367f513a7c9..2572ef26457 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -421,6 +421,28 @@ def func(df): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) +@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("func", ["min", "max", "idxmin", "idxmax"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +def test_groupby_apply_jit_reductions_special_vals( + func, groupby_jit_data, dtype, special_val +): + # dynamically generate to avoid pickling error. + + funcstr = f""" +def func(df): + return df['val1'].{func}() + """ + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + groupby_jit_data["val1"] = special_val + groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + + run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + + @pytest.mark.parametrize( "func", [ From 81bfeb1ffd7bc2ca237c2384aea53f70080868ea Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 19:05:05 -0500 Subject: [PATCH 098/121] Apply suggestions from code review --- python/cudf/udf_cpp/groupby/function.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 4617cbc5cdd..7c545231c55 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -255,7 +255,7 @@ extern "C" { __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ { \ return_type const res = name(data, size); \ - if (threadIdx.x == 0) *numba_return_value = res; \ + if (threadIdx.x == 0) { *numba_return_value = res; } \ __syncthreads(); \ return 0; \ } @@ -281,7 +281,7 @@ extern "C" { int64_t* numba_return_value, type* const data, int64_t* index, int64_t size) \ { \ auto const res = name(data, index, size); \ - if (threadIdx.x == 0) *numba_return_value = res; \ + if (threadIdx.x == 0) { *numba_return_value = res; } \ __syncthreads(); \ return 0; \ } From 3d76481d5db4457592599167aea5a2996c9f8ee4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 19:05:32 -0500 Subject: [PATCH 099/121] Code formatting --- python/cudf/udf_cpp/groupby/function.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 7c545231c55..eb17081af1c 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -255,7 +255,7 @@ extern "C" { __device__ int name##_##cname(return_type* numba_return_value, type* const data, int64_t size) \ { \ return_type const res = name(data, size); \ - if (threadIdx.x == 0) { *numba_return_value = res; } \ + if (threadIdx.x == 0) { *numba_return_value = res; 
} \ __syncthreads(); \ return 0; \ } From 97490af494868745e3521fa50539b8cdd4b5cdf6 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 21:04:08 -0500 Subject: [PATCH 100/121] Add more special value tests --- python/cudf/cudf/tests/test_groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 2572ef26457..15d9e50e7de 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -422,7 +422,9 @@ def func(df): @pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["min", "max", "idxmin", "idxmax"]) +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] +) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) def test_groupby_apply_jit_reductions_special_vals( func, groupby_jit_data, dtype, special_val From 43694f70c13c919416fa7d2466a9cbee0b07e34c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Jan 2023 21:05:18 -0500 Subject: [PATCH 101/121] Fix bugs when all values are nans --- python/cudf/udf_cpp/groupby/function.cu | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index eb17081af1c..4ff3adc1ef1 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -20,6 +20,29 @@ #include +#include +#include + +template +__device__ bool are_all_nans(cooperative_groups::thread_block const& block, + T const* data, + int64_t size) +{ + __shared__ bool result; + + if (block.thread_rank()) { result = true; } + + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { + if (not std::isnan(data[idx])) { + result = false; + break; + } + } + + block.sync(); + return result; +} + template __device__ void device_sum(cooperative_groups::thread_block const& block, T const* data, @@ -75,6 +98,10 @@ __device__ T BlockSum(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); + if constexpr (std::is_floating_point_v) { + if (are_all_nans(block, data, size)) { return 0; } + } + __shared__ T block_sum; if (block.thread_rank() == 0) { block_sum = 0; } block.sync(); @@ -127,6 +154,10 @@ __device__ T BlockMax(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); + if constexpr (std::is_floating_point_v) { + if (are_all_nans(block, data, size)) { return std::numeric_limits::quiet_NaN(); } + } + auto local_max = cudf::DeviceMax::identity(); __shared__ T block_max; if (block.thread_rank() == 0) { block_max = local_max; } @@ -150,6 +181,10 @@ __device__ T BlockMin(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); + if constexpr (std::is_floating_point_v) { + if (are_all_nans(block, data, size)) { return std::numeric_limits::quiet_NaN(); } + } + auto local_min = cudf::DeviceMin::identity(); __shared__ T block_min; From 928d404558db85f6843d0ed6f87f81d0b23c52a5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Jan 2023 10:31:01 -0500 Subject: [PATCH 102/121] Fix a result init bug in are_all_nans --- python/cudf/udf_cpp/groupby/function.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 4ff3adc1ef1..057b411b5ec 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ 
b/python/cudf/udf_cpp/groupby/function.cu @@ -30,7 +30,8 @@ __device__ bool are_all_nans(cooperative_groups::thread_block const& block, { __shared__ bool result; - if (block.thread_rank()) { result = true; } + if (block.thread_rank() == 0) { result = true; } + block.sync(); for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { if (not std::isnan(data[idx])) { From 8079047d931f2e9e228541a6416c1d4924bb9594 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 10:44:23 -0800 Subject: [PATCH 103/121] separate out idmax and idxmin tests with special values --- python/cudf/cudf/tests/test_groupby.py | 45 ++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 15d9e50e7de..867a795b565 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -396,7 +396,6 @@ def run_groupby_apply_jit_test(data, func, keys, *args): # compare cuDF jit to pandas cudf_jit_result = got_groupby_obj.apply(func, *args, engine="jit") pandas_result = expect_groupby_obj.apply(func, *args) - assert_groupby_results_equal(cudf_jit_result, pandas_result) @@ -405,7 +404,11 @@ def run_groupby_apply_jit_test(data, func, keys, *args): "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): - # dynamically generate to avoid pickling error + # ideally we'd just have: + # lambda group: getattr(group, func)() + # but the current kernel caching mechanism relies on pickle which + # does not play nice with local functions. What's below uses + # exec as a workaround to write the test functions dynamically funcstr = f""" def func(df): @@ -422,15 +425,13 @@ def func(df): @pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] -) +@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) def test_groupby_apply_jit_reductions_special_vals( func, groupby_jit_data, dtype, special_val ): # dynamically generate to avoid pickling error. - + # see test_groupby_apply_jit_reductions for details. funcstr = f""" def func(df): return df['val1'].{func}() @@ -445,6 +446,38 @@ def func(df): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) +@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +def test_groupby_apply_jit_idx_reductions_special_vals( + func, groupby_jit_data, dtype, special_val +): + # dynamically generate to avoid pickling error. + # see test_groupby_apply_jit_reductions for details. 
+ funcstr = f""" +def func(df): + return df['val1'].{func}() + """ + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + groupby_jit_data["val1"] = special_val + groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + + expect = ( + groupby_jit_data.to_pandas() + .groupby("key1", as_index=False) + .apply(func) + ) + + # for all nans or infs, return the first occurrence + expect[None] = 0 + + got = groupby_jit_data.groupby("key1").apply(func, engine="jit") + assert_eq(expect, got) + + @pytest.mark.parametrize( "func", [ From 11c0eb66a71f3c61f105afc393ad450f3167deb2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 10:46:53 -0800 Subject: [PATCH 104/121] remove redundant tests --- python/cudf/cudf/tests/test_groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 867a795b565..baa022144f4 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -482,10 +482,11 @@ def func(df): "func", [ lambda df: df["val1"].max() + df["val2"].min(), - lambda df: df["val1"].idxmax() + df["val2"].idxmin(), + lambda df: df["val1"].sum() + df["val2"].var(), + lambda df: df["val1"].mean() + df["val2"].std(), ], ) -def test_groupby_apply_jit(func, groupby_jit_data): +def test_groupby_apply_jit_basic(func, groupby_jit_data): run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"]) From 3a5afa69e6899be590e3b66234c4e81c5c9405ad Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 10:57:50 -0800 Subject: [PATCH 105/121] answer is the offsets not just an array of zeroes --- python/cudf/cudf/tests/test_groupby.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index baa022144f4..0248ca08f30 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -471,10 +471,13 @@ def func(df): .apply(func) ) + grouped = groupby_jit_data.groupby("key1") + # for all nans or infs, return the first occurrence - expect[None] = 0 + # this is equivalent to the offsets except the last one + expect[None] = grouped._grouped()[1][:-1] - got = groupby_jit_data.groupby("key1").apply(func, engine="jit") + got = grouped.apply(func, engine="jit") assert_eq(expect, got) From 4d719b5b001356ba418829b6de02e0a0bd66841b Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 11:28:55 -0800 Subject: [PATCH 106/121] dynamically register reductions --- python/cudf/cudf/core/udf/groupby_typing.py | 26 +++++++-------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 712d9df147e..f83781e25e3 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -201,23 +201,15 @@ def resolve_idxmin(self, mod): ) -_register_cuda_reduction_caller("Max", types.float64, types.float64) -_register_cuda_reduction_caller("Max", types.int64, types.int64) -_register_cuda_reduction_caller("Min", types.float64, types.float64) -_register_cuda_reduction_caller("Min", types.int64, types.int64) -_register_cuda_reduction_caller("Min", types.float64, types.float64) -_register_cuda_reduction_caller("Sum", types.int64, types.int64) -_register_cuda_reduction_caller("Sum", types.float64, types.float64) -_register_cuda_reduction_caller("Mean", types.int64, 
types.float64) -_register_cuda_reduction_caller("Mean", types.float64, types.float64) -_register_cuda_reduction_caller("Std", types.int64, types.float64) -_register_cuda_reduction_caller("Std", types.float64, types.float64) -_register_cuda_reduction_caller("Var", types.int64, types.float64) -_register_cuda_reduction_caller("Var", types.float64, types.float64) -_register_cuda_idx_reduction_caller("IdxMax", types.int64) -_register_cuda_idx_reduction_caller("IdxMax", types.float64) -_register_cuda_idx_reduction_caller("IdxMin", types.int64) -_register_cuda_idx_reduction_caller("IdxMin", types.float64) +for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: + _register_cuda_reduction_caller("Max", ty, ty) + _register_cuda_reduction_caller("Min", ty, ty) + _register_cuda_reduction_caller("Sum", ty, ty) + _register_cuda_reduction_caller("Mean", ty, types.float64) + _register_cuda_reduction_caller("Std", ty, types.float64) + _register_cuda_reduction_caller("Var", ty, types.float64) + _register_cuda_idx_reduction_caller("IdxMax", ty) + _register_cuda_idx_reduction_caller("IdxMin", ty) for attr in ("group_data", "index", "size"): From 5c5e37ca55edcd019d53b6912932707e81cb4d47 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Jan 2023 17:13:19 -0500 Subject: [PATCH 107/121] Add corner cases handling to idxmin/idxmax --- python/cudf/udf_cpp/groupby/function.cu | 26 +++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 057b411b5ec..6fcfb096a6d 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -215,6 +215,7 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) __shared__ T block_max; __shared__ int64_t block_idx_max; + __shared__ bool found_max; auto local_max = cudf::DeviceMax::identity(); auto local_idx_max = cudf::DeviceMin::identity(); @@ -222,6 +223,7 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) if (block.thread_rank() == 0) { block_max = local_max; block_idx_max = local_idx_max; + found_max = false; } block.sync(); @@ -231,6 +233,7 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) if (current_data > local_max) { local_max = current_data; local_idx_max = index[idx]; + found_max = true; } } @@ -238,9 +241,13 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) ref.fetch_max(local_max, cuda::std::memory_order_relaxed); block.sync(); - if (local_max == block_max) { - cuda::atomic_ref ref_idx{block_idx_max}; - ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); + if (found_max) { + if (local_max == block_max) { + cuda::atomic_ref ref_idx{block_idx_max}; + ref_idx.fetch_min(local_idx_max, cuda::std::memory_order_relaxed); + } + } else { + if (block.thread_rank() == 0) { block_idx_max = index[0]; } } block.sync(); @@ -254,6 +261,7 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) __shared__ T block_min; __shared__ int64_t block_idx_min; + __shared__ bool found_min; auto local_min = cudf::DeviceMin::identity(); auto local_idx_min = cudf::DeviceMin::identity(); @@ -261,6 +269,7 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) if (block.thread_rank() == 0) { block_min = local_min; block_idx_min = local_idx_min; + found_min = false; } block.sync(); @@ -270,6 +279,7 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) if (current_data 
< local_min) { local_min = current_data; local_idx_min = index[idx]; + found_min = true; } } @@ -277,9 +287,13 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) ref.fetch_min(local_min, cuda::std::memory_order_relaxed); block.sync(); - if (local_min == block_min) { - cuda::atomic_ref ref_idx{block_idx_min}; - ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); + if (found_min) { + if (local_min == block_min) { + cuda::atomic_ref ref_idx{block_idx_min}; + ref_idx.fetch_min(local_idx_min, cuda::std::memory_order_relaxed); + } + } else { + if (block.thread_rank() == 0) { block_idx_min = index[0]; } } block.sync(); From fac8d70e3d0659efe08aca2134a0ea0764e9574f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Jan 2023 17:35:49 -0500 Subject: [PATCH 108/121] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/udf_cpp/groupby/function.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 6fcfb096a6d..746c469adf3 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -191,7 +191,6 @@ __device__ T BlockMin(T const* data, int64_t size) __shared__ T block_min; if (block.thread_rank() == 0) { block_min = local_min; - printf("min: %lld\n", int64_t(local_min)); } block.sync(); From 2f9cc7627d254d834d53baa3a1029a3d9159de72 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Jan 2023 18:03:49 -0500 Subject: [PATCH 109/121] Remove unroll pragma --- python/cudf/udf_cpp/groupby/function.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 6fcfb096a6d..cb72db3da86 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -52,7 +52,6 @@ __device__ void device_sum(cooperative_groups::thread_block const& block, { T local_sum = 0; -#pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { local_sum += data[idx]; } @@ -80,7 +79,6 @@ __device__ void device_var(cooperative_groups::thread_block const& block, auto const mean = static_cast(block_sum) / static_cast(size); -#pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { auto const delta = static_cast(data[idx]) - mean; local_var += delta * delta; @@ -164,7 +162,6 @@ __device__ T BlockMax(T const* data, int64_t size) if (block.thread_rank() == 0) { block_max = local_max; } block.sync(); -#pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { local_max = max(local_max, data[idx]); } @@ -195,7 +192,6 @@ __device__ T BlockMin(T const* data, int64_t size) } block.sync(); -#pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { local_min = min(local_min, data[idx]); } @@ -227,7 +223,6 @@ __device__ int64_t BlockIdxMax(T const* data, int64_t* index, int64_t size) } block.sync(); -#pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { auto const current_data = data[idx]; if (current_data > local_max) { @@ -273,7 +268,6 @@ __device__ int64_t BlockIdxMin(T const* data, int64_t* index, int64_t size) } block.sync(); -#pragma unroll for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { auto const current_data = data[idx]; if (current_data < local_min) { From 6665ef9f1b7b40b8ad65b366e83d35774848948e Mon Sep 17 00:00:00 2001 From: 
brandon-b-miller Date: Wed, 25 Jan 2023 16:25:16 -0800 Subject: [PATCH 110/121] address remaining reviews --- python/cudf/cudf/__init__.py | 1 - python/cudf/cudf/core/groupby/groupby.py | 23 +++++++++++++++- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 31 +++++++++++++--------- 4 files changed, 42 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 49d5d329a3b..b86fb72d955 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -3,7 +3,6 @@ from cudf.utils.gpu_utils import validate_setup validate_setup() -import os import cupy from numba import config as numba_config, cuda diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index fbcfaa447d3..bdf35a154bc 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -853,7 +853,11 @@ def apply(self, function, *args, engine="cudf"): Optional positional arguments to pass to the function. engine: {'cudf', 'jit'}, default 'cudf' Selects the GroupBy.apply implementation. Use `jit` to - select the numba JIT pipeline. + select the numba JIT pipeline. Only certain operations are allowed + within the function when using this option: min, max, sum, mean, var, + std, idxmax, and idxmin and any arithmetic formula involving them are + allowed. Binary operations are not yet supported, so syntax like + `df['x'] * 2` is not yet allowed. For more information, see the `cuDF guide to user defined functions `__. @@ -912,6 +916,23 @@ def mult(df): a b c 0 1 1 1 2 2 1 3 + + ``engine='jit'`` can be used to accelerate certain functions, + initially those that contain reductions and arithmetic operations + between results of those reductions: + + .. code-block:: + + >>> import cudf + >>> df = cudf.DataFrame({'a':[1,1,2,2,3,3], 'b':[1,2,3,4,5,6]}) + >>> df.groupby('a').apply( + lambda group: group['b'].max() - group['b'].min(), + engine='jit' + ) + a None + 0 1 1 + 1 2 1 + 2 3 1 """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 272db34a1e5..a1174835db9 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -172,7 +172,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): max_group_size = cp.diff(offsets).max() - if max_group_size >= 1000: + if max_group_size >= 256: blocklim = 256 else: blocklim = ((max_group_size + 32 - 1) // 32) * 32 diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0248ca08f30..4f0d73cf657 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2,6 +2,7 @@ import datetime import itertools +import textwrap from decimal import Decimal import numpy as np @@ -410,10 +411,12 @@ def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): # does not play nice with local functions. What's below uses # exec as a workaround to write the test functions dynamically - funcstr = f""" -def func(df): - return df['val1'].{func}() - """ + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) lcl = {} exec(funcstr, lcl) func = lcl["func"] @@ -432,10 +435,12 @@ def test_groupby_apply_jit_reductions_special_vals( ): # dynamically generate to avoid pickling error. 
# see test_groupby_apply_jit_reductions for details. - funcstr = f""" -def func(df): - return df['val1'].{func}() - """ + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) lcl = {} exec(funcstr, lcl) func = lcl["func"] @@ -454,10 +459,12 @@ def test_groupby_apply_jit_idx_reductions_special_vals( ): # dynamically generate to avoid pickling error. # see test_groupby_apply_jit_reductions for details. - funcstr = f""" -def func(df): - return df['val1'].{func}() - """ + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) lcl = {} exec(funcstr, lcl) func = lcl["func"] From 62a892881ec663137cb9add7365c7ef32dca5299 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 17:21:29 -0800 Subject: [PATCH 111/121] fix tests --- python/cudf/cudf/tests/test_groupby.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 4f0d73cf657..c5b330fd89c 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -479,10 +479,9 @@ def func(df): ) grouped = groupby_jit_data.groupby("key1") - - # for all nans or infs, return the first occurrence - # this is equivalent to the offsets except the last one - expect[None] = grouped._grouped()[1][:-1] + sorted = grouped._grouped()[3].to_pandas() + expect_vals = sorted["key1"].drop_duplicates().index + expect[None] = expect_vals got = grouped.apply(func, engine="jit") assert_eq(expect, got) From de6b54c8e7e42f82bc0e5a625a843e5a7728620f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Jan 2023 20:28:04 -0500 Subject: [PATCH 112/121] Code formatting --- python/cudf/udf_cpp/groupby/function.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 86c216cc842..bf924cf175f 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -186,9 +186,7 @@ __device__ T BlockMin(T const* data, int64_t size) auto local_min = cudf::DeviceMin::identity(); __shared__ T block_min; - if (block.thread_rank() == 0) { - block_min = local_min; - } + if (block.thread_rank() == 0) { block_min = local_min; } block.sync(); for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { From 0c3d5a0b024825e378ca20cf71018110f6bd4b09 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 17:37:49 -0800 Subject: [PATCH 113/121] go back to index_default_type = types.int64 explicitly --- python/cudf/cudf/core/udf/groupby_typing.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index f83781e25e3..37381a95fdf 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -2,7 +2,6 @@ from typing import Any, Dict import numba -import pandas as pd from numba import cuda, types from numba.core.extending import ( make_attribute_wrapper, @@ -16,9 +15,7 @@ from numba.cuda.cudadecl import registry as cuda_registry from numba.np import numpy_support -index_default_type = numpy_support.from_dtype( - pd.RangeIndex(0, 0).dtype -) # int64 +index_default_type = types.int64 group_size_type = types.int64 SUPPORTED_GROUPBY_NUMBA_TYPES = [types.int64, types.float64] SUPPORTED_GROUPBY_NUMPY_TYPES = [ From 5db0b6cf27b4926b36aba2760d7359c210b9c2b0 Mon Sep 17 
00:00:00 2001 From: brandon-b-miller Date: Wed, 25 Jan 2023 20:05:46 -0800 Subject: [PATCH 114/121] style --- python/strings_udf/strings_udf/_typing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 69b9b5b1582..fa87ad63dc2 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -9,9 +9,8 @@ from numba.core.typing.templates import AbstractTemplate, AttributeTemplate from numba.cuda.cudadecl import registry as cuda_decl_registry -from cudf.core.udf.utils import _get_extensionty_size import rmm - +from cudf.core.udf.utils import _get_extensionty_size # libcudf size_type size_type = types.int32 From 7e2ca13b9d3943de7b1d9bf45cf2695a87f9b7fb Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 26 Jan 2023 14:33:36 -0600 Subject: [PATCH 115/121] Update python/cudf/udf_cpp/groupby/CMakeLists.txt Co-authored-by: Robert Maynard --- python/cudf/udf_cpp/groupby/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/udf_cpp/groupby/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt index 190a590ab79..2337aba4ec4 100644 --- a/python/cudf/udf_cpp/groupby/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -41,7 +41,7 @@ function(copy_ptx_to_location target destination) CONTENT " set(ptx_paths \"$\") -file(COPY \${ptx_paths} DESTINATION \"${destination}\")" +file(COPY_FILE \${ptx_paths} \"${destination}/${target}.ptx\")" ) add_custom_target( From 40b8ce9e49353d0ec3835c1d22a4c82ead6622e2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Jan 2023 18:08:37 -0500 Subject: [PATCH 116/121] Cast mean results to double --- python/cudf/udf_cpp/groupby/function.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index bf924cf175f..62d2c649939 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -119,7 +119,7 @@ __device__ double BlockMean(T const* data, int64_t size) block.sync(); device_sum(block, data, size, &block_sum); - return block_sum / static_cast(size); + return static_cast(block_sum) / static_cast(size); } template From 81860c5a6869e53b8154e3498f4fb30378e6496f Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 27 Jan 2023 06:09:52 -0800 Subject: [PATCH 117/121] address reviews --- python/cudf/cudf/core/groupby/groupby.py | 37 +++++++++++----------- python/cudf/udf_cpp/groupby/CMakeLists.txt | 13 -------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index bdf35a154bc..1fcbb0a7e09 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -800,7 +800,9 @@ def _jit_groupby_apply( chunk_results = jit_groupby_apply( offsets, grouped_values, function, *args ) - result = cudf.Series(chunk_results, index=group_names) + result = cudf.Series._from_data( + {None: chunk_results}, index=group_names + ) result.index.names = self.grouping.names result = result.reset_index() result[None] = result.pop(0) @@ -826,7 +828,9 @@ def _iterative_groupby_apply( return self.obj.head(0) if cudf.api.types.is_scalar(chunk_results[0]): - result = cudf.Series(chunk_results, index=group_names) + result = cudf.Series._from_data( + {None: 
chunk_results}, index=group_names + ) result.index.names = self.grouping.names elif isinstance(chunk_results[0], cudf.Series) and isinstance( self.obj, cudf.DataFrame @@ -917,22 +921,19 @@ def mult(df): 0 1 1 1 2 2 1 3 - ``engine='jit'`` can be used to accelerate certain functions, - initially those that contain reductions and arithmetic operations - between results of those reductions: - - .. code-block:: - - >>> import cudf - >>> df = cudf.DataFrame({'a':[1,1,2,2,3,3], 'b':[1,2,3,4,5,6]}) - >>> df.groupby('a').apply( - lambda group: group['b'].max() - group['b'].min(), - engine='jit' - ) - a None - 0 1 1 - 1 2 1 - 2 3 1 + ``engine='jit'`` may be used to accelerate certain functions, + initially those that contain reductions and arithmetic operations + between results of those reductions: + >>> import akdfsf + >>> df = cudf.DataFrame({'a':[1,1,2,2,3,3], 'b':[1,2,3,4,5,6]}) + >>> df.groupby('a').apply( + ... lambda group: group['b'].max() - group['b'].min(), + ... engine='jit' + ... ) + a None + 0 1 1 + 1 2 1 + 2 3 1 """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") diff --git a/python/cudf/udf_cpp/groupby/CMakeLists.txt b/python/cudf/udf_cpp/groupby/CMakeLists.txt index 2337aba4ec4..043ab28f362 100644 --- a/python/cudf/udf_cpp/groupby/CMakeLists.txt +++ b/python/cudf/udf_cpp/groupby/CMakeLists.txt @@ -14,21 +14,8 @@ cmake_minimum_required(VERSION 3.23.1) -include(rapids-cmake) -include(rapids-cuda) include(rapids-find) -rapids_cuda_init_architectures(groupby-udf-cpp) - -# Create a project so that we can enable CUDA architectures in this file. -project( - groupby-udf-cpp - VERSION 23.02.00 - LANGUAGES CUDA -) - -rapids_find_package(CUDAToolkit REQUIRED) - # This function will copy the generated PTX file from its generator-specific location in the build # tree into a specified location in the build tree from which we can install it. function(copy_ptx_to_location target destination) From 2f352bc927247c7938a181c574631c7bbbc06d76 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 27 Jan 2023 06:11:38 -0800 Subject: [PATCH 118/121] minor edits --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/udf/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1fcbb0a7e09..91e00eb43f3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -924,7 +924,7 @@ def mult(df): ``engine='jit'`` may be used to accelerate certain functions, initially those that contain reductions and arithmetic operations between results of those reductions: - >>> import akdfsf + >>> import cudf >>> df = cudf.DataFrame({'a':[1,1,2,2,3,3], 'b':[1,2,3,4,5,6]}) >>> df.groupby('a').apply( ... 
lambda group: group['b'].max() - group['b'].min(), diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 8591d8d60f8..3ee1d8edcbd 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -411,7 +411,7 @@ def maybe_patch_numba_linker( if (driver_version < ptx_toolkit_version) or ( driver_version < runtime_version ): - print( + logger.debug( "Driver version %s.%s needs patching due to PTX files" % driver_version ) From 0b407c87ab7314bf6bac41ff4594a1e6aec2d88b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 27 Jan 2023 11:08:03 -0500 Subject: [PATCH 119/121] Compute blockstd via blockvar --- python/cudf/udf_cpp/groupby/function.cu | 71 ++++++++++--------------- 1 file changed, 28 insertions(+), 43 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 62d2c649939..510c1f9c2d8 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -62,36 +62,6 @@ __device__ void device_sum(cooperative_groups::thread_block const& block, block.sync(); } -template -__device__ void device_var(cooperative_groups::thread_block const& block, - T const* data, - int64_t size, - double* var) -{ - T local_sum = 0; - double local_var = 0; - - __shared__ T block_sum; - if (block.thread_rank() == 0) { block_sum = 0; } - block.sync(); - - device_sum(block, data, size, &block_sum); - - auto const mean = static_cast(block_sum) / static_cast(size); - - for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { - auto const delta = static_cast(data[idx]) - mean; - local_var += delta * delta; - } - - cuda::atomic_ref ref{*var}; - ref.fetch_add(local_var, cuda::std::memory_order_relaxed); - block.sync(); - - if (block.thread_rank() == 0) { *var = *var / static_cast(size - 1); } - block.sync(); -} - template __device__ T BlockSum(T const* data, int64_t size) { @@ -123,31 +93,46 @@ __device__ double BlockMean(T const* data, int64_t size) } template -__device__ double BlockStd(T const* data, int64_t size) +__device__ double BlockVar(T const* data, int64_t size) { auto block = cooperative_groups::this_thread_block(); - __shared__ double var; - if (block.thread_rank() == 0) { var = 0; } + __shared__ double block_var; + __shared__ T block_sum; + if (block.thread_rank() == 0) { + block_var = 0; + block_sum = 0; + } block.sync(); - device_var(block, data, size, &var); - return sqrt(var); -} + T local_sum = 0; + double local_var = 0; -template -__device__ double BlockVar(T const* data, int64_t size) -{ - auto block = cooperative_groups::this_thread_block(); + device_sum(block, data, size, &block_sum); - __shared__ double block_var; - if (block.thread_rank() == 0) { block_var = 0; } + auto const mean = static_cast(block_sum) / static_cast(size); + + for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { + auto const delta = static_cast(data[idx]) - mean; + local_var += delta * delta; + } + + cuda::atomic_ref ref{block_var}; + ref.fetch_add(local_var, cuda::std::memory_order_relaxed); block.sync(); - device_var(block, data, size, &block_var); + if (block.thread_rank() == 0) { block_var = block_var / static_cast(size - 1); } + block.sync(); return block_var; } +template +__device__ double BlockStd(T const* data, int64_t size) +{ + auto const var = BlockVar(data, size); + return sqrt(var); +} + template __device__ T BlockMax(T const* data, int64_t size) { From dbd5eebb3a397118f11406200f2a81a890204391 Mon Sep 17 00:00:00 2001 
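A note on the refactor above: [PATCH 119/121] folds the old device_var helper into BlockVar and then defines BlockStd as nothing more than sqrt(BlockVar(data, size)), so variance is the single accumulation path and the standard deviation can never drift from it. The toy program below sketches that same two-pass shape (a block-stride sum to form the mean, then a block-stride sum of squared deviations, then one ddof = 1 division) outside the cuDF build. It is an illustration under stated assumptions, not the shipped kernel: block_var, var_std_kernel and the host harness are hypothetical names invented here, plain atomicAdd on shared doubles (sm_60 or newer) stands in for the cudf::DeviceSum identities and cuda::atomic_ref used in the real code, and the all-NaN early exits are omitted.

// toy_block_var.cu, illustrative sketch only; compile with: nvcc -arch=sm_70 toy_block_var.cu
#include <cooperative_groups.h>
#include <cstdint>
#include <cstdio>

namespace cg = cooperative_groups;

// Two-pass, single-block variance mirroring the structure BlockVar has after
// this patch: pass 1 accumulates the sum so the mean can be formed, pass 2
// accumulates squared deviations, and the division by (size - 1) happens once.
__device__ double block_var(cg::thread_block const& block,
                            double const* data,
                            int64_t size)
{
  __shared__ double block_sum;
  __shared__ double block_sq_dev;
  if (block.thread_rank() == 0) {
    block_sum    = 0.0;
    block_sq_dev = 0.0;
  }
  block.sync();

  // Pass 1: every thread strides over the group and adds its share of the sum.
  double local_sum = 0.0;
  for (int64_t i = block.thread_rank(); i < size; i += block.size()) {
    local_sum += data[i];
  }
  atomicAdd(&block_sum, local_sum);
  block.sync();

  double const mean = block_sum / static_cast<double>(size);

  // Pass 2: accumulate squared deviations from the block-wide mean.
  double local_sq_dev = 0.0;
  for (int64_t i = block.thread_rank(); i < size; i += block.size()) {
    double const delta = data[i] - mean;
    local_sq_dev += delta * delta;
  }
  atomicAdd(&block_sq_dev, local_sq_dev);
  block.sync();

  // Sample variance (ddof = 1), matching the pandas default the UDFs target.
  return block_sq_dev / static_cast<double>(size - 1);
}

__global__ void var_std_kernel(double const* data, int64_t size,
                               double* out_var, double* out_std)
{
  auto block = cg::this_thread_block();
  double const var = block_var(block, data, size);
  if (block.thread_rank() == 0) {
    *out_var = var;
    *out_std = sqrt(var);  // BlockStd in the patch is exactly sqrt of BlockVar
  }
}

int main()
{
  double h_data[6] = {1, 2, 3, 4, 5, 6};
  double *d_data, *d_var, *d_std;
  double h_var, h_std;
  cudaMalloc(&d_data, sizeof(h_data));
  cudaMalloc(&d_var, sizeof(double));
  cudaMalloc(&d_std, sizeof(double));
  cudaMemcpy(d_data, h_data, sizeof(h_data), cudaMemcpyHostToDevice);
  var_std_kernel<<<1, 128>>>(d_data, 6, d_var, d_std);
  cudaMemcpy(&h_var, d_var, sizeof(double), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_std, d_std, sizeof(double), cudaMemcpyDeviceToHost);
  printf("var = %f, std = %f\n", h_var, h_std);  // expect 3.5 and ~1.870829
  cudaFree(d_data);
  cudaFree(d_var);
  cudaFree(d_std);
  return 0;
}

Deriving std from var this way is the design choice the patch makes: one reduction body to maintain, and both statistics stay consistent by construction.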
From: Yunsong Wang Date: Fri, 27 Jan 2023 13:00:58 -0500 Subject: [PATCH 120/121] Use atomic operations to avoid concurrent writes --- python/cudf/udf_cpp/groupby/function.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index 510c1f9c2d8..febfd722b52 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -28,20 +28,22 @@ __device__ bool are_all_nans(cooperative_groups::thread_block const& block, T const* data, int64_t size) { - __shared__ bool result; + // TODO: to be refactored with CG vote functions once + // block size is known at build time + __shared__ int result; - if (block.thread_rank() == 0) { result = true; } + if (block.thread_rank() == 0) { result = 0; } block.sync(); for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { if (not std::isnan(data[idx])) { - result = false; + atomicAdd(&result, 1); break; } } block.sync(); - return result; + return result == 0; } template From eaa8ff7c39560e1014010239084089687e766225 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 27 Jan 2023 13:15:49 -0500 Subject: [PATCH 121/121] Use int64_t atomic ref --- python/cudf/udf_cpp/groupby/function.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/udf_cpp/groupby/function.cu b/python/cudf/udf_cpp/groupby/function.cu index febfd722b52..f94f99c4b49 100644 --- a/python/cudf/udf_cpp/groupby/function.cu +++ b/python/cudf/udf_cpp/groupby/function.cu @@ -30,20 +30,21 @@ __device__ bool are_all_nans(cooperative_groups::thread_block const& block, { // TODO: to be refactored with CG vote functions once // block size is known at build time - __shared__ int result; + __shared__ int64_t count; - if (block.thread_rank() == 0) { result = 0; } + if (block.thread_rank() == 0) { count = 0; } block.sync(); for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) { if (not std::isnan(data[idx])) { - atomicAdd(&result, 1); + cuda::atomic_ref ref{count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); break; } } block.sync(); - return result == 0; + return count == 0; } template
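The series is cut off above part-way through the function that follows are_all_nans, but the two final patches ([PATCH 120/121] and [PATCH 121/121]) are small and complete: they replace the unsynchronized result = false stores, which several threads could issue concurrently to the same shared flag, with a shared counter that each non-NaN finder bumps through a block-scoped cuda::atomic_ref (first as an int via atomicAdd, then as an int64_t). The group is treated as all NaN only if the counter is still zero after the final sync. The sketch below shows that voting pattern as a self-contained toy; the all_nan_kernel wrapper and the host main are a hypothetical harness written for this note rather than cuDF code, and, as the in-tree TODO says, a cooperative-groups vote could replace the counter once the block size is known at build time.

// toy_all_nans.cu, illustrative sketch only; compile with a CUDA 11+ toolkit
// (for <cuda/atomic>), e.g.: nvcc -std=c++17 -arch=sm_70 toy_all_nans.cu
#include <cooperative_groups.h>
#include <cuda/atomic>
#include <cmath>
#include <cstdint>
#include <cstdio>

namespace cg = cooperative_groups;

// Block-wide "are all values NaN?" vote. Each thread strides over its share of
// the group; the first non-NaN it sees bumps a shared counter through an
// atomic_ref (so concurrent reporters never race) and stops scanning.
__device__ bool are_all_nans(cg::thread_block const& block,
                             double const* data,
                             int64_t size)
{
  __shared__ int64_t count;
  if (block.thread_rank() == 0) { count = 0; }
  block.sync();

  for (int64_t idx = block.thread_rank(); idx < size; idx += block.size()) {
    if (!isnan(data[idx])) {
      cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{count};
      ref.fetch_add(1, cuda::std::memory_order_relaxed);
      break;
    }
  }

  block.sync();
  return count == 0;
}

__global__ void all_nan_kernel(double const* data, int64_t size, int* out)
{
  auto block = cg::this_thread_block();
  // Every thread must take part because of the block-wide syncs inside.
  bool const all_nan = are_all_nans(block, data, size);
  if (block.thread_rank() == 0) { *out = all_nan ? 1 : 0; }
}

int main()
{
  double h_data[4] = {std::nan(""), std::nan(""), 3.0, std::nan("")};
  double* d_data;
  int *d_out, h_out;
  cudaMalloc(&d_data, sizeof(h_data));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_data, h_data, sizeof(h_data), cudaMemcpyHostToDevice);
  all_nan_kernel<<<1, 64>>>(d_data, 4, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("all nan? %d\n", h_out);  // prints 0: index 2 holds a real value
  cudaFree(d_data);
  cudaFree(d_out);
  return 0;
}

In the UDF kernels this check runs at the top of the floating-point BlockSum, BlockMax and BlockMin bodies, which is what lets an all-NaN group return 0 or a quiet NaN instead of feeding NaNs into the reductions.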