diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index a9194ceea93..bcedc2f62c6 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -63,7 +63,7 @@ struct src_buf_info { int _offset_stack_pos, int _parent_offsets_index, bool _is_validity, - int _column_offset) + size_type _column_offset) : type(_type), offsets(_offsets), offset_stack_pos(_offset_stack_pos), @@ -194,11 +194,18 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, if (value_shift || bit_shift) { std::size_t idx = (num_bytes - remainder) / 4; uint32_t v = remainder > 0 ? (reinterpret_cast(src)[idx] - value_shift) : 0; + + constexpr size_type rows_per_element = 32; + auto const have_trailing_bits = ((num_elements * rows_per_element) - num_rows) < bit_shift; while (remainder) { - uint32_t const next = bit_shift > 0 || remainder > 4 - ? (reinterpret_cast(src)[idx + 1] - value_shift) - : 0; - uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift)); + // if we're at the very last word of a validity copy, we do not always need to read the next + // word to get the final trailing bits. + auto const read_trailing_bits = bit_shift > 0 && remainder == 4 && have_trailing_bits; + uint32_t const next = (read_trailing_bits || remainder > 4) + ? (reinterpret_cast(src)[idx + 1] - value_shift) + : 0; + + uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift)); if (valid_count) { thread_valid_count += __popc(val); } reinterpret_cast(dst)[idx] = val; v = next; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index f7714ce9ac7..b5a793ecd1c 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -1315,6 +1315,21 @@ TEST_F(ContiguousSplitUntypedTest, ProgressiveSizes) } } +TEST_F(ContiguousSplitUntypedTest, ValidityEdgeCase) +{ + // tests an edge case where the splits cause the final validity data to be copied + // to be < 32 full bits, making sure we don't unintentionally read past the end of the input + auto col = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, 512, cudf::mask_state::ALL_VALID); + auto result = cudf::contiguous_split(cudf::table_view{{*col}}, {510}); + auto expected = cudf::split(cudf::table_view{{*col}}, {510}); + + EXPECT_EQ(expected.size(), result.size()); + for (unsigned long index = 0; index < result.size(); index++) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected[index].column(0), result[index].table.column(0)); + } +} + // contiguous split with strings struct ContiguousSplitStringTableTest : public SplitTest { };