From 2eba5d536d279c9923cd45fd9d0ea2422b8eacf1 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 7 Nov 2022 12:21:57 -0600
Subject: [PATCH 1/3] Update custreamz README.

---
 python/custreamz/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/custreamz/README.md b/python/custreamz/README.md
index 99ada746ec8..a1d98425d66 100644
--- a/python/custreamz/README.md
+++ b/python/custreamz/README.md
@@ -48,8 +48,8 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids
 
 ### CUDA/GPU requirements
 
-* CUDA 10.0+
-* NVIDIA driver 410.48+
+* CUDA 11.0+
+* NVIDIA driver 450.80.02+
 * Pascal architecture or better (Compute Capability >=6.0)
 
 ### Conda

From 43af53214c2ebbc5ba013e330fef36806361875d Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 7 Nov 2022 13:27:36 -0600
Subject: [PATCH 2/3] Use thrust::prev.

---
 cpp/src/copying/concatenate.cu         |  6 ++----
 cpp/src/strings/copying/concatenate.cu | 12 ++++--------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index 802b47e4664..577d6427b19 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -180,10 +180,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views,
   if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
-    // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
-    auto const offset_it =
-      -1 + thrust::upper_bound(
-             thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
+    auto const offset_it = thrust::prev(thrust::upper_bound(
+      thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
     size_type const partition_index = offset_it - input_offsets;
 
     // Copy input data to output
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 627e689d4d9..e44c343e31b 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -127,10 +127,8 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const*
   if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
-    // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
-    auto const offset_it =
-      -1 + thrust::upper_bound(
-             thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
+    auto const offset_it = thrust::prev(thrust::upper_bound(
+      thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
     size_type const partition_index = offset_it - input_offsets;
 
     auto const offset_index = output_index - *offset_it;
@@ -180,10 +178,8 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const*
 
   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
-    // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
-    auto const offset_it =
-      -1 + thrust::upper_bound(
-             thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index);
+    auto const offset_it = thrust::prev(thrust::upper_bound(
+      thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index));
     size_type const partition_index = offset_it - partition_offsets;
 
     auto const offset_index = output_index - *offset_it;
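Note on PATCH 2 (editor illustration, not part of the commit): upper_bound
over the partition offsets returns the first offset greater than the global
output index, and stepping back one position with thrust::prev lands on the
partition that owns that index; the removed "-1 +" performed the same
arithmetic directly on the raw iterator. The removed comment notes that
thrust::prev was missing from CUDA 10.0's Thrust, so this cleanup relies on
the CUDA 11.0 floor established in PATCH 1. A minimal host-side sketch of
the same lookup, using the std:: equivalents of the thrust:: calls (the
offsets and index below are made-up values for illustration):

    #include <algorithm>  // std::upper_bound
    #include <cstdio>
    #include <iterator>   // std::prev

    int main()
    {
      // Prefix offsets of three partitions holding 3, 4, and 3 rows each.
      int const offsets[]      = {0, 3, 7};
      int const num_partitions = 3;
      int const output_index   = 5;  // hypothetical global row index

      // The first offset greater than 5 is 7; prev() steps back to 3, the
      // start of the owning partition -- the same shape as the kernels above.
      auto const offset_it =
        std::prev(std::upper_bound(offsets, offsets + num_partitions, output_index));

      int const partition_index = static_cast<int>(offset_it - offsets);  // 1
      int const local_index     = output_index - *offset_it;              // 5 - 3 = 2
      std::printf("partition %d, local index %d\n", partition_index, local_index);
      return 0;
    }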
From ac417b78d5ca10376cba196f69a9814ffafcace1 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 7 Nov 2022 13:29:12 -0600
Subject: [PATCH 3/3] Remove volatile declaration.

---
 cpp/src/rolling/detail/rolling.cuh | 20 +++++++-------------
 cpp/src/rolling/jit/kernel.cu      |  8 ++------
 2 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh
index 4394557e453..68480dbf773 100644
--- a/cpp/src/rolling/detail/rolling.cuh
+++ b/cpp/src/rolling/detail/rolling.cuh
@@ -122,10 +122,8 @@ struct DeviceRolling {
     using AggOp = typename corresponding_operator<op>::type;
     AggOp agg_op;
 
-    // declare this as volatile to avoid some compiler optimizations that lead to incorrect results
-    // for CUDA 10.0 and below (fixed in CUDA 10.1)
-    volatile cudf::size_type count = 0;
-    OutputType val                 = AggOp::template identity<OutputType>();
+    cudf::size_type count = 0;
+    OutputType val        = AggOp::template identity<OutputType>();
 
     for (size_type j = start_index; j < end_index; j++) {
       if (!has_nulls || input.is_valid(j)) {
@@ -190,11 +188,9 @@ struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase<cudf::string_vi
     using AggOp = typename corresponding_operator<op>::type;
     AggOp agg_op;
 
-    // declare this as volatile to avoid some compiler optimizations that lead to incorrect results
-    // for CUDA 10.0 and below (fixed in CUDA 10.1)
-    volatile cudf::size_type count = 0;
-    InputType val                  = AggOp::template identity<InputType>();
-    OutputType val_index           = default_output;
+    cudf::size_type count = 0;
+    InputType val         = AggOp::template identity<InputType>();
+    OutputType val_index  = default_output;
 
     for (size_type j = start_index; j < end_index; j++) {
       if (!has_nulls || input.is_valid(j)) {
@@ -284,13 +280,11 @@ struct DeviceRollingCountValid {
                              size_type end_index,
                              size_type current_index)
   {
-    // declare this as volatile to avoid some compiler optimizations that lead to incorrect
-    // results for CUDA 10.0 and below (fixed in CUDA 10.1)
-    volatile cudf::size_type count = 0;
-
     bool output_is_valid = ((end_index - start_index) >= min_periods);
 
     if (output_is_valid) {
+      cudf::size_type count = 0;
+
       if (!has_nulls) {
         count = end_index - start_index;
       } else {
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index ecdbbb6a0f2..3bfee32d1cc 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -58,10 +58,6 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
 
   auto active_threads = __ballot_sync(0xffff'ffffu, i < nrows);
   while (i < nrows) {
-    // declare this as volatile to avoid some compiler optimizations that lead to incorrect results
-    // for CUDA 10.0 and below (fixed in CUDA 10.1)
-    volatile cudf::size_type count = 0;
-
     int64_t const preceding_window = get_window(preceding_window_begin, i);
     int64_t const following_window = get_window(following_window_begin, i);
 
@@ -77,8 +73,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     // TODO: We should explore using shared memory to avoid redundant loads.
    //       This might require separating the kernel into a special version
     //       for dynamic and static sizes.
-    count       = end_index - start_index;
-    OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
+    cudf::size_type count = end_index - start_index;
+    OutType val           = agg_op::template operate<OutType, InType>(in_col, start_index, count);
 
     // check if we have enough input samples
     bool const output_is_valid = (count >= min_periods);
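Note on PATCH 3 (editor illustration, not part of the commit): the deleted
comments record why the qualifier existed -- volatile was only a workaround
for an NVCC miscompilation in CUDA 10.0 that was fixed in CUDA 10.1, and it
pessimizes these kernels by forcing every access to count through memory
rather than a register. With CUDA 11.0 as the minimum supported toolkit, a
plain local variable is safe. A host-side sketch of the count-valid logic as
it reads after this patch (illustrative only: the function name is made up
and `valid` stands in for the column's validity mask):

    #include <cstdio>
    #include <vector>

    // Rolling count of valid rows in the window [start_index, end_index):
    // with no nulls the count is just the window width; otherwise each
    // row's validity is summed. `count` is an ordinary local variable --
    // the volatile workaround is obsolete on CUDA 11+.
    int rolling_count_valid(std::vector<bool> const& valid,
                            bool has_nulls,
                            int start_index,
                            int end_index)
    {
      int count = 0;
      if (!has_nulls) {
        count = end_index - start_index;
      } else {
        for (int j = start_index; j < end_index; ++j) {
          if (valid[j]) { ++count; }
        }
      }
      return count;
    }

    int main()
    {
      std::vector<bool> const valid = {true, false, true, true, false};
      // Window [1, 4) covers rows 1, 2, 3 -> two valid rows.
      std::printf("%d\n", rolling_count_valid(valid, true, 1, 4));  // prints 2
      return 0;
    }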