Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove CUDA 10 compatibility code. #12088

Merged
merged 3 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views,
if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
// thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
auto const offset_it =
-1 + thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
auto const offset_it = thrust::prev(thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
size_type const partition_index = offset_it - input_offsets;

// Copy input data to output
Expand Down
20 changes: 7 additions & 13 deletions cpp/src/rolling/detail/rolling.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,8 @@ struct DeviceRolling {
using AggOp = typename corresponding_operator<op>::type;
AggOp agg_op;

// declare this as volatile to avoid some compiler optimizations that lead to incorrect results
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;
OutputType val = AggOp::template identity<OutputType>();
cudf::size_type count = 0;
OutputType val = AggOp::template identity<OutputType>();

for (size_type j = start_index; j < end_index; j++) {
if (!has_nulls || input.is_valid(j)) {
Expand Down Expand Up @@ -190,11 +188,9 @@ struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase<cudf::string_vi
using AggOp = typename corresponding_operator<op>::type;
AggOp agg_op;

// declare this as volatile to avoid some compiler optimizations that lead to incorrect results
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;
InputType val = AggOp::template identity<InputType>();
OutputType val_index = default_output;
cudf::size_type count = 0;
InputType val = AggOp::template identity<InputType>();
OutputType val_index = default_output;

for (size_type j = start_index; j < end_index; j++) {
if (!has_nulls || input.is_valid(j)) {
Expand Down Expand Up @@ -284,13 +280,11 @@ struct DeviceRollingCountValid {
size_type end_index,
size_type current_index)
{
// declare this as volatile to avoid some compiler optimizations that lead to incorrect
// results for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;

bool output_is_valid = ((end_index - start_index) >= min_periods);

if (output_is_valid) {
cudf::size_type count = 0;

if (!has_nulls) {
count = end_index - start_index;
} else {
Expand Down
8 changes: 2 additions & 6 deletions cpp/src/rolling/jit/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,6 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,

auto active_threads = __ballot_sync(0xffff'ffffu, i < nrows);
while (i < nrows) {
// declare this as volatile to avoid some compiler optimizations that lead to incorrect results
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;

int64_t const preceding_window = get_window(preceding_window_begin, i);
int64_t const following_window = get_window(following_window_begin, i);

Expand All @@ -77,8 +73,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
// TODO: We should explore using shared memory to avoid redundant loads.
// This might require separating the kernel into a special version
// for dynamic and static sizes.
count = end_index - start_index;
OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
cudf::size_type count = end_index - start_index;
OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);

// check if we have enough input samples
bool const output_is_valid = (count >= min_periods);
Expand Down
12 changes: 4 additions & 8 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,8 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const*
if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
// thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
auto const offset_it =
-1 + thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
auto const offset_it = thrust::prev(thrust::upper_bound(
thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
size_type const partition_index = offset_it - input_offsets;

auto const offset_index = output_index - *offset_it;
Expand Down Expand Up @@ -180,10 +178,8 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const*

while (output_index < output_size) {
// Lookup input index by searching for output index in offsets
// thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
auto const offset_it =
-1 + thrust::upper_bound(
thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index);
auto const offset_it = thrust::prev(thrust::upper_bound(
thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index));
size_type const partition_index = offset_it - partition_offsets;

auto const offset_index = output_index - *offset_it;
Expand Down
4 changes: 2 additions & 2 deletions python/custreamz/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids

### CUDA/GPU requirements

* CUDA 10.0+
* NVIDIA driver 410.48+
* CUDA 11.0+
* NVIDIA driver 450.80.02+
* Pascal architecture or better (Compute Capability >=6.0)

### Conda
Expand Down