-
Notifications
You must be signed in to change notification settings - Fork 917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix empty cluster handling in tdigest merge #16675
Changes from 10 commits
2cb240d
99521e1
c4d968b
6f53dd2
47fcf0d
18b6fd5
ceaf17b
79cb1c3
0ea092e
aa063cd
ffa7997
3492b9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -366,8 +366,8 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest, | |
* @param group_cluster_wl Output. The set of cluster weight limits for each group. | ||
* @param group_num_clusters Output. The number of output clusters for each input group. | ||
* @param group_cluster_offsets Offsets per-group to the start of it's clusters | ||
* @param has_nulls Whether or not the input contains nulls | ||
* | ||
* @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be | ||
* set to false when there is no empty cluster, true otherwise. | ||
*/ | ||
|
||
template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight> | ||
|
@@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, | |
double* group_cluster_wl, | ||
size_type* group_num_clusters, | ||
size_type const* group_cluster_offsets, | ||
bool has_nulls) | ||
bool may_have_empty_clusters) | ||
{ | ||
int const tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
|
||
|
@@ -399,11 +399,12 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, | |
// a group with nothing in it. | ||
group_num_clusters[group_index] = 0; | ||
if (total_weight <= 0) { | ||
// if the input contains nulls we can potentially have a group that generates no | ||
// clusters because -all- of the input values are null. in that case, the reduce_by_key call | ||
// in the tdigest generation step will need a location to store the unused reduction value for | ||
// that group of nulls. these "stubs" will be postprocessed out afterwards. | ||
if (has_nulls) { group_num_clusters[group_index] = 1; } | ||
// If the input contains empty clusters, we can potentially have a group that also generates | ||
// empty clusters because -all- of the input values are null or empty cluster. In that case, the | ||
// `reduce_by_key` call in the tdigest generation step will need a location to store the unused | ||
// reduction value for that group of nulls and empty clusters. These "stubs" will be | ||
// postprocessed out afterwards. | ||
if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; } | ||
jihoonson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return; | ||
} | ||
|
||
|
@@ -502,7 +503,8 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, | |
* stream that falls before our current cluster limit | ||
* @param group_info A functor which returns the info for the specified group (total weight, | ||
* size and start offset) | ||
* @param has_nulls Whether or not the input data contains nulls | ||
* @param may_have_empty_clusters Whether or not there could be empty clusters. It should be | ||
* set to false only when there is no empty cluster. | ||
* @param stream CUDA stream used for device memory operations and kernel launches. | ||
* @param mr Device memory resource used to allocate the returned column's device memory | ||
* | ||
|
@@ -516,7 +518,7 @@ generate_group_cluster_info(int delta, | |
NearestWeight nearest_weight, | ||
GroupInfo group_info, | ||
CumulativeWeight cumulative_weight, | ||
bool has_nulls, | ||
bool may_have_empty_clusters, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
|
@@ -535,7 +537,7 @@ generate_group_cluster_info(int delta, | |
nullptr, | ||
group_num_clusters.begin(), | ||
nullptr, | ||
has_nulls); | ||
may_have_empty_clusters); | ||
|
||
// generate group cluster offsets (where the clusters for a given group start and end) | ||
auto group_cluster_offsets = cudf::make_numeric_column( | ||
|
@@ -567,7 +569,7 @@ generate_group_cluster_info(int delta, | |
group_cluster_wl.begin(), | ||
group_num_clusters.begin(), | ||
group_cluster_offsets->view().begin<size_type>(), | ||
has_nulls); | ||
may_have_empty_clusters); | ||
|
||
return {std::move(group_cluster_wl), | ||
std::move(group_cluster_offsets), | ||
|
@@ -580,7 +582,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows, | |
std::unique_ptr<column>&& offsets, | ||
std::unique_ptr<column>&& min_col, | ||
std::unique_ptr<column>&& max_col, | ||
bool has_nulls, | ||
bool may_have_empty_clusters, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
|
@@ -595,7 +597,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows, | |
size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; | ||
|
||
size_type const num_stubs = [&]() { | ||
if (!has_nulls) { return 0; } | ||
if (!may_have_empty_clusters) { return 0; } | ||
auto iter = cudf::detail::make_counting_transform_iterator( | ||
0, cuda::proclaim_return_type<size_type>(is_stub_digest)); | ||
return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows); | ||
|
@@ -661,6 +663,10 @@ std::unique_ptr<column> build_output_column(size_type num_rows, | |
mr); | ||
} | ||
|
||
/** | ||
* @brief A functor which returns the cluster index within a group that the value at | ||
* the given value index falls into. | ||
*/ | ||
template <typename CumulativeWeight> | ||
struct compute_tdigests_keys_fn { | ||
int const delta; | ||
|
@@ -706,16 +712,17 @@ struct compute_tdigests_keys_fn { | |
* boundaries. | ||
* | ||
* @param delta tdigest compression level | ||
* @param values_begin Beginning of the range of input values. | ||
* @param values_end End of the range of input values. | ||
* @param centroids_begin Beginning of the range of centroids. | ||
* @param centroids_end End of the range of centroids. | ||
jihoonson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
* @param cumulative_weight Functor which returns cumulative weight and group information for | ||
* an absolute input value index. | ||
* @param min_col Column containing the minimum value per group. | ||
* @param max_col Column containing the maximum value per group. | ||
* @param group_cluster_wl Cluster weight limits for each group. | ||
* @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. | ||
* @param total_clusters Total number of clusters in all groups. | ||
* @param has_nulls Whether or not the input contains nulls | ||
* @param may_have_empty_clusters Whether or not there could be empty clusters. It should be | ||
* set to false only when there is no empty cluster. | ||
* @param stream CUDA stream used for device memory operations and kernel launches. | ||
* @param mr Device memory resource used to allocate the returned column's device memory | ||
* | ||
|
@@ -731,7 +738,7 @@ std::unique_ptr<column> compute_tdigests(int delta, | |
rmm::device_uvector<double> const& group_cluster_wl, | ||
std::unique_ptr<column>&& group_cluster_offsets, | ||
size_type total_clusters, | ||
bool has_nulls, | ||
bool may_have_empty_clusters, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
|
@@ -750,7 +757,9 @@ std::unique_ptr<column> compute_tdigests(int delta, | |
// double // max | ||
// } | ||
// | ||
if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } | ||
if (total_clusters == 0) { | ||
return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); | ||
} | ||
|
||
// each input group represents an individual tdigest. within each tdigest, we want the keys | ||
// to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall | ||
|
@@ -793,7 +802,7 @@ std::unique_ptr<column> compute_tdigests(int delta, | |
std::move(group_cluster_offsets), | ||
std::move(min_col), | ||
std::move(max_col), | ||
has_nulls, | ||
may_have_empty_clusters, | ||
stream, | ||
mr); | ||
} | ||
|
@@ -1145,8 +1154,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv, | |
auto merged = | ||
cudf::detail::concatenate(tdigest_views, stream, rmm::mr::get_current_device_resource()); | ||
|
||
auto merged_weights = merged->get_column(1).view(); | ||
// If there are no values, we can simply return a column that has only empty tdigests. | ||
if (merged_weights.size() == 0) { | ||
return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr); | ||
} | ||
Comment on lines
+1157
to
+1161
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should be just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not even needed to create a temp variable and it can directly be used in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @ttnghia. Your explanation aligns with my understanding. In that case, I think this is OK because the shortcut in this |
||
|
||
// generate cumulative weights | ||
auto merged_weights = merged->get_column(1).view(); | ||
auto cumulative_weights = cudf::make_numeric_column( | ||
data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); | ||
auto keys = cudf::detail::make_counting_transform_iterator( | ||
|
@@ -1161,6 +1175,10 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv, | |
|
||
auto const delta = max_centroids; | ||
|
||
// We do not know whether there is any empty cluster in the input without actually reading the | ||
// data, which could be expensive. So, we just assume that there could be empty clusters. | ||
auto const may_have_empty_clusters = true; | ||
Comment on lines
+1178
to
+1180
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we would never change it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Before this PR, this flag was fixed to false, which implies that empty clusters should never be found during the merge. This is not always true, which is the root cause of the bug. It is rather always true to assume that there might be some clusters until we inspect all clusters, which is expensive. Having this flag fixed to true might be always not optimal when there is indeed no empty cluster. We can make this better later for this case if that turns out to be a problem. |
||
|
||
// generate cluster info | ||
auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( | ||
delta, | ||
|
@@ -1177,7 +1195,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv, | |
group_labels, | ||
group_offsets, | ||
{tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}}, | ||
false, | ||
may_have_empty_clusters, | ||
stream, | ||
mr); | ||
|
||
|
@@ -1202,7 +1220,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv, | |
group_cluster_wl, | ||
std::move(group_cluster_offsets), | ||
total_clusters, | ||
false, | ||
may_have_empty_clusters, | ||
stream, | ||
mr); | ||
} | ||
|
@@ -1267,7 +1285,9 @@ std::unique_ptr<column> group_tdigest(column_view const& col, | |
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } | ||
if (col.size() == 0) { | ||
return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); | ||
} | ||
|
||
auto const delta = max_centroids; | ||
return cudf::type_dispatcher(col.type(), | ||
|
@@ -1293,7 +1313,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input, | |
tdigest_column_view tdv(input); | ||
|
||
if (num_groups == 0 || input.size() == 0) { | ||
return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); | ||
return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); | ||
} | ||
|
||
// bring group offsets back to the host | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. I fixed this doc to further clarify it.