Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change nullable() to has_nulls() in cudf::detail::gather #14363

Merged
merged 18 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
#pragma once

#include <cstddef>
divyegala marked this conversation as resolved.
Show resolved Hide resolved
#include <cudf/detail/copy.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/null_mask.hpp>
Expand Down Expand Up @@ -678,9 +679,22 @@ std::unique_ptr<table> gather(table_view const& source_table,
return col.nullable();
});
if (nullable) {
auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY
: gather_bitmask_op::DONT_CHECK;
gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr);
auto const has_nulls =
bounds_policy == out_of_bounds_policy::NULLIFY || cudf::has_nested_nulls(source_table);
Copy link
Contributor

@ttnghia ttnghia Nov 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this use && instead? Because if we indeed don't have any nulls here then we don't need to call gather_bitmask.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably I misunderstood the usage of this variable. So this variable should be called need_new_bitmask or so. It should not be has_nulls.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we need to call gather_bitmask if out_of_bounds_policy::NULLIFY. gather_bitmask will help nullify any OOB accesses.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, so || is indeed needed, but please rename that variable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if (has_nulls) {
auto const op = bounds_policy == out_of_bounds_policy::NULLIFY
? gather_bitmask_op::NULLIFY
: gather_bitmask_op::DONT_CHECK;
gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr);
} else {
for (size_type i = 0; i < source_table.num_columns(); ++i) {
if (source_table.column(i).nullable()) {
auto mask = detail::create_null_mask(
destination_columns[i]->size(), mask_state::ALL_VALID, stream, mr);
destination_columns[i]->set_null_mask(std::move(mask), 0);
}
}
}
}

return std::make_unique<table>(std::move(destination_columns));
Expand Down
17 changes: 12 additions & 5 deletions cpp/src/groupby/sort/group_quantiles.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ struct calculate_quantile_fn {
double const* d_quantiles;
size_type num_quantiles;
interpolation interpolation;
size_type* null_count;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be an unrelated change to this PR so ideally it should be in a separate PR. But I'm fine to keep this here but please clarify that in the PR description.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


__device__ void operator()(size_type i)
{
Expand All @@ -68,11 +69,13 @@ struct calculate_quantile_fn {
thrust::for_each_n(thrust::seq,
thrust::make_counting_iterator(0),
num_quantiles,
[d_result = d_result, segment_size, offset](size_type j) {
if (segment_size == 0)
[d_result = d_result, segment_size, offset, this](size_type j) {
if (segment_size == 0) {
d_result.set_null(offset + j);
else
atomicAdd(this->null_count, 1);
bdice marked this conversation as resolved.
Show resolved Hide resolved
} else {
d_result.set_valid(offset + j);
}
});
}
};
Expand Down Expand Up @@ -104,6 +107,7 @@ struct quantiles_functor {
auto values_view = column_device_view::create(values, stream);
auto group_size_view = column_device_view::create(group_sizes, stream);
auto result_view = mutable_column_device_view::create(result->mutable_view(), stream);
auto null_count = rmm::device_scalar<cudf::size_type>(0, stream, mr);

// For each group, calculate quantile
if (!cudf::is_dictionary(values.type())) {
Expand All @@ -118,7 +122,8 @@ struct quantiles_functor {
group_offsets.data(),
quantile.data(),
static_cast<size_type>(quantile.size()),
interpolation});
interpolation,
null_count.data()});
} else {
auto values_iter = cudf::dictionary::detail::make_dictionary_iterator<T>(*values_view);
thrust::for_each_n(rmm::exec_policy(stream),
Expand All @@ -131,9 +136,11 @@ struct quantiles_functor {
group_offsets.data(),
quantile.data(),
static_cast<size_type>(quantile.size()),
interpolation});
interpolation,
null_count.data()});
}

result->set_null_count(null_count.value(stream));
return result;
}

Expand Down
56 changes: 0 additions & 56 deletions cpp/tests/join/join_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1941,62 +1941,6 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls)
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
}

TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown)
{
// When joining on a STRUCT column, if the parent nulls are not reflected in
// the children, the join might produce incorrect results.
//
// In this test, a fact table of structs is joined against a dimension table.
// Both tables must match (only) on the NULL row. This will fail if the fact table's
// nulls are not pushed down into its children.
using ints = column_wrapper<int32_t>;
using structs = cudf::test::structs_column_wrapper;
using namespace cudf::test::iterators;

auto make_table = [](auto&& col) {
auto columns = CVector{};
columns.push_back(std::move(col));
return cudf::table{std::move(columns)};
};

auto const fact_table = [make_table] {
auto fact_ints = ints{0, 1, 2, 3, 4};
auto fact_structs = structs{{fact_ints}, no_nulls()}.release();
// Now set struct validity to invalidate index#3.
cudf::detail::set_null_mask(
fact_structs->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream());
// Struct row#3 is null, but Struct.child has a non-null value.
return make_table(std::move(fact_structs));
}();

auto const dimension_table = [make_table] {
auto dim_ints = ints{999};
auto dim_structs = structs{{dim_ints}, null_at(0)};
return make_table(dim_structs.release());
}();

auto const result = inner_join(fact_table.view(), dimension_table.view(), {0}, {0});
EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match.

// Note: Join result might not have nulls pushed down, since it's an output of gather().
// Must superimpose parent nulls before comparisons.
auto [superimposed_results, _] = cudf::structs::detail::push_down_nulls(
*result, cudf::get_default_stream(), rmm::mr::get_current_device_resource());

auto const expected = [] {
auto fact_ints = ints{0};
auto fact_structs = structs{{fact_ints}, null_at(0)};
auto dim_ints = ints{0};
auto dim_structs = structs{{dim_ints}, null_at(0)};
auto columns = CVector{};
columns.push_back(fact_structs.release());
columns.push_back(dim_structs.release());
return cudf::table{std::move(columns)};
}();

CUDF_TEST_EXPECT_TABLES_EQUIVALENT(superimposed_results, expected);
}

using lcw = cudf::test::lists_column_wrapper<int32_t>;
using cudf::test::iterators::null_at;

Expand Down
Loading