Skip to content

Commit

Permalink
Remove CUDA 10 compatibility code. (#12088)
Browse files Browse the repository at this point in the history
This PR updates some documentation and removes some compatibility layers referencing CUDA 10, which is no longer supported by the package.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - David Wendt (https://github.com/davidwendt)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #12088
  • Loading branch information
bdice authored Nov 8, 2022
1 parent 8ee5f51 commit 7535f31
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 33 deletions.
6 changes: 2 additions & 4 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views,
if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
// thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
auto const offset_it =
-1 + thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
auto const offset_it = thrust::prev(thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
size_type const partition_index = offset_it - input_offsets;

// Copy input data to output
Expand Down
20 changes: 7 additions & 13 deletions cpp/src/rolling/detail/rolling.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,8 @@ struct DeviceRolling {
using AggOp = typename corresponding_operator<op>::type;
AggOp agg_op;

// declare this as volatile to avoid some compiler optimizations that lead to incorrect results
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;
OutputType val = AggOp::template identity<OutputType>();
cudf::size_type count = 0;
OutputType val = AggOp::template identity<OutputType>();

for (size_type j = start_index; j < end_index; j++) {
if (!has_nulls || input.is_valid(j)) {
Expand Down Expand Up @@ -190,11 +188,9 @@ struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase<cudf::string_vi
using AggOp = typename corresponding_operator<op>::type;
AggOp agg_op;

// declare this as volatile to avoid some compiler optimizations that lead to incorrect results
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;
InputType val = AggOp::template identity<InputType>();
OutputType val_index = default_output;
cudf::size_type count = 0;
InputType val = AggOp::template identity<InputType>();
OutputType val_index = default_output;

for (size_type j = start_index; j < end_index; j++) {
if (!has_nulls || input.is_valid(j)) {
Expand Down Expand Up @@ -284,13 +280,11 @@ struct DeviceRollingCountValid {
size_type end_index,
size_type current_index)
{
// declare this as volatile to avoid some compiler optimizations that lead to incorrect
// results for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;

bool output_is_valid = ((end_index - start_index) >= min_periods);

if (output_is_valid) {
cudf::size_type count = 0;

if (!has_nulls) {
count = end_index - start_index;
} else {
Expand Down
8 changes: 2 additions & 6 deletions cpp/src/rolling/jit/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,6 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,

auto active_threads = __ballot_sync(0xffff'ffffu, i < nrows);
while (i < nrows) {
// declare this as volatile to avoid some compiler optimizations that lead to incorrect results
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;

int64_t const preceding_window = get_window(preceding_window_begin, i);
int64_t const following_window = get_window(following_window_begin, i);

Expand All @@ -77,8 +73,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
// TODO: We should explore using shared memory to avoid redundant loads.
// This might require separating the kernel into a special version
// for dynamic and static sizes.
count = end_index - start_index;
OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
cudf::size_type count = end_index - start_index;
OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);

// check if we have enough input samples
bool const output_is_valid = (count >= min_periods);
Expand Down
12 changes: 4 additions & 8 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,8 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const*
if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
// thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
auto const offset_it =
-1 + thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
auto const offset_it = thrust::prev(thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
size_type const partition_index = offset_it - input_offsets;

auto const offset_index = output_index - *offset_it;
Expand Down Expand Up @@ -180,10 +178,8 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const*

while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
// thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
auto const offset_it =
-1 + thrust::upper_bound(
thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index);
auto const offset_it = thrust::prev(thrust::upper_bound(
thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index));
size_type const partition_index = offset_it - partition_offsets;

auto const offset_index = output_index - *offset_it;
Expand Down
4 changes: 2 additions & 2 deletions python/custreamz/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids

### CUDA/GPU requirements

* CUDA 10.0+
* NVIDIA driver 410.48+
* CUDA 11.0+
* NVIDIA driver 450.80.02+
* Pascal architecture or better (Compute Capability >=6.0)

### Conda
Expand Down

0 comments on commit 7535f31

Please sign in to comment.