Batch of fixes for index overflows in grid stride loops. (#10448)
Partially addresses #10368.

Specifically:
- `valid_if`
- `scatter`
- `rolling_window`
- `compute_column_kernel` (the AST expression evaluator)
- `replace_nulls` (fixed-width and strings)

Most of the fixes simply change the kernel's indexing variable from a `cudf::size_type` to a 64-bit index (the new `cudf::thread_index_type`). `scatter` also had one additional overflow outside the kernel, in the transform iterator that normalizes negative scatter-map indices.
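
For illustration, here is a minimal sketch of the pattern (a hypothetical standalone kernel, not one of the cudf kernels touched here): with a 32-bit loop counter the grid-stride increment can overflow past `INT32_MAX` on inputs near 2^31 elements, while a 64-bit counter cannot.

```cpp
// Hypothetical kernels illustrating the overflow and the fix; names and types
// here are illustrative only.
#include <cstdint>

using thread_index_type = int64_t;  // mirrors the alias added to cudf/types.hpp

__global__ void copy_overflow_prone(float* out, float const* in, int32_t size)
{
  // 32-bit loop counter: as `i` approaches INT32_MAX, `i += stride` overflows
  // and the loop misbehaves for very large inputs.
  int32_t i = threadIdx.x + blockIdx.x * blockDim.x;
  while (i < size) {
    out[i] = in[i];
    i += blockDim.x * gridDim.x;
  }
}

__global__ void copy_grid_stride_fixed(float* out, float const* in, int32_t size)
{
  // 64-bit loop counter and stride, the pattern applied throughout this PR.
  // Promote before multiplying so the thread id itself is computed in 64 bits.
  thread_index_type i = threadIdx.x + static_cast<thread_index_type>(blockIdx.x) * blockDim.x;
  thread_index_type const stride = static_cast<thread_index_type>(blockDim.x) * gridDim.x;
  while (i < size) {
    out[i] = in[i];
    i += stride;
  }
}
```

Using a signed 64-bit type (rather than `std::size_t`) keeps comparisons against signed `size_type` values free of signed/unsigned conversion surprises.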

I didn't add tests for these fixes, but each of them was tested locally to confirm it reproduced the overflow and then verified again with the fix applied.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)

URL: #10448
nvdbaranec authored Mar 23, 2022
1 parent ce5bacb commit 5129ee5
Showing 7 changed files with 57 additions and 43 deletions.
5 changes: 3 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -48,7 +48,8 @@ __global__ void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
size_type i = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type const stride = blockDim.x * gridDim.x;
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
@@ -58,7 +59,7 @@
output[cudf::word_index(i)] = ballot;
warp_valid_count += __popc(ballot);
}
i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < size);
}

9 changes: 5 additions & 4 deletions cpp/include/cudf/types.hpp
@@ -83,10 +83,11 @@ class mutable_table_view;
* @file
*/

using size_type = int32_t;
using bitmask_type = uint32_t;
using valid_type = uint8_t;
using offset_type = int32_t;
using size_type = int32_t;
using bitmask_type = uint32_t;
using valid_type = uint8_t;
using offset_type = int32_t;
using thread_index_type = int64_t;

/**
* @brief Similar to `std::distance` but returns `cudf::size_type` and performs `static_cast`
14 changes: 10 additions & 4 deletions cpp/src/copying/scatter.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -47,7 +47,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
MapIterator scatter_map,
size_type num_scatter_rows)
{
size_type row = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type row = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type const stride = blockDim.x * gridDim.x;

while (row < num_scatter_rows) {
size_type const output_row = scatter_map[row];
@@ -58,7 +59,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
destination.set_null(output_row);
}

row += blockDim.x * gridDim.x;
row += stride;
}
}

@@ -351,8 +352,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>>

// Transform negative indices to index + target size
auto scatter_rows = indices.size();
// note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in`
// > (2^31)/2, but the end result after the final (% n_rows) will fit. so we'll do the computation
// using a signed 64 bit value.
auto scatter_iter = thrust::make_transform_iterator(
map_begin, [n_rows] __device__(size_type in) { return ((in % n_rows) + n_rows) % n_rows; });
map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type {
return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows;
});

// Dispatch over data type per column
auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
15 changes: 9 additions & 6 deletions cpp/src/replace/nulls.cu
@@ -62,8 +62,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
char* chars,
cudf::size_type* valid_counter)
{
cudf::size_type nrows = input.size();
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type nrows = input.size();
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

uint32_t active_mask = 0xffffffff;
active_mask = __ballot_sync(active_mask, i < nrows);
@@ -98,7 +99,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes());
}

i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < nrows);
}

@@ -114,8 +115,9 @@
cudf::mutable_column_device_view output,
cudf::size_type* output_valid_count)
{
cudf::size_type nrows = input.size();
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type nrows = input.size();
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

uint32_t active_mask = 0xffffffff;
active_mask = __ballot_sync(active_mask, i < nrows);
@@ -141,7 +143,7 @@
}
}

i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < nrows);
}
if (replacement_has_nulls) {
@@ -247,6 +249,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<

std::unique_ptr<cudf::column> offsets = cudf::strings::detail::make_offsets_child_column(
sizes_view.begin<int32_t>(), sizes_view.end<int32_t>(), stream, mr);

auto offsets_view = offsets->mutable_view();

auto const bytes =
26 changes: 14 additions & 12 deletions cpp/src/rolling/jit/kernel.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,13 +25,13 @@ namespace rolling {
namespace jit {

template <typename WindowType>
cudf::size_type __device__ get_window(WindowType window, cudf::size_type index)
cudf::size_type __device__ get_window(WindowType window, cudf::thread_index_type index)
{
return window[index];
}

template <>
cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index)
cudf::size_type __device__ get_window(cudf::size_type window, cudf::thread_index_type index)
{
return window;
}
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
FollowingWindowType following_window_begin,
cudf::size_type min_periods)
{
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type stride = blockDim.x * gridDim.x;
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

cudf::size_type warp_valid_count{0};

@@ -62,14 +62,16 @@
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;

cudf::size_type preceding_window = get_window(preceding_window_begin, i);
cudf::size_type following_window = get_window(following_window_begin, i);
int64_t const preceding_window = get_window(preceding_window_begin, i);
int64_t const following_window = get_window(following_window_begin, i);

// compute bounds
cudf::size_type start = min(nrows, max(0, i - preceding_window + 1));
cudf::size_type end = min(nrows, max(0, i + following_window + 1));
cudf::size_type start_index = min(start, end);
cudf::size_type end_index = max(start, end);
auto const start = static_cast<cudf::size_type>(
min(static_cast<int64_t>(nrows), max(int64_t{0}, i - preceding_window + 1)));
auto const end = static_cast<cudf::size_type>(
min(static_cast<int64_t>(nrows), max(int64_t{0}, i + following_window + 1)));
auto const start_index = min(start, end);
auto const end_index = max(start, end);

// aggregate
// TODO: We should explore using shared memory to avoid redundant loads.
@@ -79,7 +81,7 @@
OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);

// check if we have enough input samples
bool output_is_valid = (count >= min_periods);
bool const output_is_valid = (count >= min_periods);

// set the mask
const unsigned int result_mask = __ballot_sync(active_threads, output_is_valid);
22 changes: 11 additions & 11 deletions cpp/src/rolling/rolling_detail.cuh
@@ -1008,24 +1008,24 @@ __launch_bounds__(block_size) __global__
PrecedingWindowIterator preceding_window_begin,
FollowingWindowIterator following_window_begin)
{
size_type i = blockIdx.x * block_size + threadIdx.x;
size_type stride = block_size * gridDim.x;
thread_index_type i = blockIdx.x * block_size + threadIdx.x;
thread_index_type const stride = block_size * gridDim.x;

size_type warp_valid_count{0};

auto active_threads = __ballot_sync(0xffffffff, i < input.size());
while (i < input.size()) {
// to prevent overflow issues when computing bounds use int64_t
int64_t preceding_window = preceding_window_begin[i];
int64_t following_window = following_window_begin[i];
int64_t const preceding_window = preceding_window_begin[i];
int64_t const following_window = following_window_begin[i];

// compute bounds
auto start = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(0L, i - preceding_window + 1)));
auto end = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(0L, i + following_window + 1)));
size_type start_index = min(start, end);
size_type end_index = max(start, end);
auto const start = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(int64_t{0}, i - preceding_window + 1)));
auto const end = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(int64_t{0}, i + following_window + 1)));
auto const start_index = min(start, end);
auto const end_index = max(start, end);

// aggregate
// TODO: We should explore using shared memory to avoid redundant loads.
@@ -1037,7 +1037,7 @@
input, default_outputs, output, start_index, end_index, i);

// set the mask
cudf::bitmask_type result_mask{__ballot_sync(active_threads, output_is_valid)};
cudf::bitmask_type const result_mask{__ballot_sync(active_threads, output_is_valid)};

// only one thread writes the mask
if (0 == threadIdx.x % cudf::detail::warp_size) {
9 changes: 5 additions & 4 deletions cpp/src/transform/compute_column.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -67,12 +67,13 @@ __launch_bounds__(max_block_size) __global__

auto thread_intermediate_storage =
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
auto const start_idx =
static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
auto evaluator =
cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
for (thread_index_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
}
