diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 620504f5c93..27936ce04b3 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -408,7 +408,7 @@ __global__ void compute_row_sizes(device_span cols, if (tid >= num_rows) { return; } // branch stack. points to the last list prior to branching. - row_span* my_branch_stack = thread_branch_stacks + (tid * max_branch_depth); + row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth); size_type branch_depth{0}; // current row span - always starts at 1 row. diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 0081cf0d467..8284def5f13 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -25,6 +27,9 @@ #include +#include +#include + using namespace cudf; template @@ -192,6 +197,66 @@ TEST_F(RowBitCount, StringsWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); } +namespace { + +/** + * @brief __device__ functor to multiply input by 2, defined out of line because __device__ lambdas + * cannot be defined in a TEST_F(). + */ +struct times_2 { + int32_t __device__ operator()(int32_t i) const { return i * 2; } +}; + +} // namespace + +TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) +{ + // Tests that `row_bit_count()` can handle struct> with more + // than max_block_size (256) rows. + // With a large number of rows, computation spills to multiple thread-blocks, + // thus exercising the branch-stack comptutation. + // The contents of the input column aren't as pertinent to this test as the + // column size. For what it's worth, it looks as follows: + // [ struct({0,1}), struct({2,3}), struct({4,5}), ... ] + + using namespace cudf; + auto constexpr num_rows = 1024 * 2; // Exceeding a block size. + + // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; + auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); + auto ints_view = ints->mutable_view(); + thrust::tabulate(thrust::device, + ints_view.begin(), + ints_view.end(), + thrust::identity()); + + // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; + auto list_offsets = make_numeric_column(data_type{type_id::INT32}, num_rows + 1); + auto list_offsets_view = list_offsets->mutable_view(); + thrust::tabulate(thrust::device, + list_offsets_view.begin(), + list_offsets_view.end(), + times_2{}); + + // List = {{0,1}, {2,3}, {4,5}, ..., {2*(num_rows-1), 2*num_rows-1}}; + auto lists_column = make_lists_column(num_rows, std::move(list_offsets), std::move(ints), 0, {}); + + // Struct. + auto struct_members = std::vector>{}; + struct_members.emplace_back(std::move(lists_column)); + auto structs_column = make_structs_column(num_rows, std::move(struct_members), 0, {}); + + // Compute row_bit_count, and compare. + auto row_bit_counts = row_bit_count(table_view{{structs_column->view()}}); + auto expected_row_bit_counts = make_numeric_column(data_type{type_id::INT32}, num_rows); + thrust::fill_n(thrust::device, + expected_row_bit_counts->mutable_view().begin(), + num_rows, + CHAR_BIT * (2 * sizeof(int32_t) + sizeof(offset_type))); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(row_bit_counts->view(), expected_row_bit_counts->view()); +} + std::pair, std::unique_ptr> build_struct_column() { std::vector struct_validity{0, 1, 1, 1, 1, 0};