Skip to content

Commit

Permalink
Add struct column support to cudf::sort and cudf::sorted_order (#7422)
Browse files Browse the repository at this point in the history
closes #7226

Add struct column support to `cudf::sort`, `cudf::sorted_order`
Struct columns are supported by flattening each struct into its individual child columns in a `table_view`; the null mask of each nullable struct is converted to a boolean column that carries the same null mask.

Authors:
  - Karthikeyan (@karthikeyann)

Approvers:
  - Gera Shegalov (@gerashegalov)
  - David (@davidwendt)
  - Nghia Truong (@ttnghia)
  - Jake Hemstad (@jrhemstad)
  - Conor Hoekstra (@codereport)

URL: #7422
  • Loading branch information
karthikeyann authored Mar 26, 2021
1 parent b8f149a commit 20509d0
Show file tree
Hide file tree
Showing 15 changed files with 707 additions and 45 deletions.
7 changes: 7 additions & 0 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,13 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
return d_children[child_index];
}

/**
 * @brief Reports how many child columns this column view has.
 *
 * Callable from both host and device code.
 *
 * @return The child-column count stored in this view
 */
__host__ __device__ size_type num_child_columns() const noexcept
{
  return _num_children;
}

protected:
column_device_view* d_children{}; ///< Array of `column_device_view`
///< objects in device memory.
Expand Down
46 changes: 31 additions & 15 deletions cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -91,6 +91,26 @@ __device__ weak_ordering relational_compare(Element lhs, Element rhs)
return detail::compare_elements(lhs, rhs);
}

/**
 * @brief Orders two elements based solely on whether each of them is null.
 *
 * The element values themselves are never inspected.
 *
 * @param lhs_is_null true if the left-hand element is null
 * @param rhs_is_null true if the right-hand element is null
 * @param null_precedence whether nulls are ordered before or after non-null elements
 * @return `weak_ordering::EQUIVALENT` when both sides are null or neither side is null;
 * otherwise `weak_ordering::LESS` or `weak_ordering::GREATER`, describing `lhs` relative to
 * `rhs` according to `null_precedence`.
 */
inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_order null_precedence)
{
  // Same nullness on both sides (null vs null, or value vs value): nothing to order here.
  if (lhs_is_null == rhs_is_null) { return weak_ordering::EQUIVALENT; }

  // Exactly one side is null. `lhs` compares LESS when it is the null side and nulls go
  // BEFORE, or when it is the non-null side and nulls go AFTER.
  auto const precedence_making_lhs_less = lhs_is_null ? null_order::BEFORE : null_order::AFTER;
  return (null_precedence == precedence_making_lhs_less) ? weak_ordering::LESS
                                                         : weak_ordering::GREATER;
}

/**
* @brief A specialization for non-floating-point `Element` type relational
* comparison to derive the order of the elements with respect to `lhs`.
Expand Down Expand Up @@ -173,8 +193,8 @@ class element_equality_comparator {
noexcept
{
if (has_nulls) {
bool const lhs_is_null{lhs.nullable() and lhs.is_null(lhs_element_index)};
bool const rhs_is_null{rhs.nullable() and rhs.is_null(rhs_element_index)};
bool const lhs_is_null{lhs.is_null(lhs_element_index)};
bool const rhs_is_null{rhs.is_null(rhs_element_index)};
if (lhs_is_null and rhs_is_null) {
return nulls_are_equal;
} else if (lhs_is_null != rhs_is_null) {
Expand Down Expand Up @@ -269,17 +289,11 @@ class element_relational_comparator {
size_type rhs_element_index) const noexcept
{
if (has_nulls) {
bool const lhs_is_null{lhs.nullable() and lhs.is_null(lhs_element_index)};
bool const rhs_is_null{rhs.nullable() and rhs.is_null(rhs_element_index)};

if (lhs_is_null and rhs_is_null) { // null <? null
return weak_ordering::EQUIVALENT;
} else if (lhs_is_null) { // null <? x
return (null_precedence == null_order::BEFORE) ? weak_ordering::LESS
: weak_ordering::GREATER;
} else if (rhs_is_null) { // x <? null
return (null_precedence == null_order::AFTER) ? weak_ordering::LESS
: weak_ordering::GREATER;
bool const lhs_is_null{lhs.is_null(lhs_element_index)};
bool const rhs_is_null{rhs.is_null(rhs_element_index)};

if (lhs_is_null or rhs_is_null) { // atleast one is null
return null_compare(lhs_is_null, rhs_is_null, null_precedence);
}
}

Expand Down Expand Up @@ -324,6 +338,7 @@ class row_lexicographic_comparator {
* comparison between the rows of two tables.
*
* @throws cudf::logic_error if `lhs.num_columns() != rhs.num_columns()`
* @throws cudf::logic_error if column types of `lhs` and `rhs` are not comparable.
*
* @param lhs The first table
* @param rhs The second table (may be the same table as `lhs`)
Expand All @@ -341,8 +356,9 @@ class row_lexicographic_comparator {
null_order const* null_precedence = nullptr)
: _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence}
{
// Add check for types to be the same.
CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns.");
CUDF_EXPECTS(detail::is_relationally_comparable(_lhs, _rhs),
"Attempted to compare elements of uncomparable types.");
}

/**
Expand Down
8 changes: 7 additions & 1 deletion cpp/include/cudf/table/table_device_view.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -149,4 +149,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
return std::make_tuple(std::move(descendant_storage), d_columns);
}

namespace detail {
// Explicit instantiation declarations of `is_relationally_comparable` for the device-side
// table view types. `extern template` prevents translation units that include this header
// from implicitly instantiating these specializations; the matching explicit instantiation
// definitions are expected to live in a source file (not visible from this header).
extern template bool is_relationally_comparable<table_device_view>(table_device_view const& lhs,
table_device_view const& rhs);
extern template bool is_relationally_comparable<mutable_table_device_view>(
mutable_table_device_view const& lhs, mutable_table_device_view const& rhs);
} // namespace detail
} // namespace cudf
19 changes: 18 additions & 1 deletion cpp/include/cudf/table/table_view.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -291,4 +291,21 @@ table_view scatter_columns(table_view const& source,
std::vector<size_type> const& map,
table_view const& target);

namespace detail {
/**
 * @brief Indicates whether the respective columns of two input tables are relationally
 * comparable.
 *
 * Only declared here; the definition is not part of this header.
 *
 * @tparam TableView Type of the table views being checked
 * @param lhs The first table
 * @param rhs The second table (may be the same table as `lhs`)
 * @return true if all of the respective columns of the `lhs` and `rhs` tables are comparable.
 * @return false if any of the respective columns of the `lhs` and `rhs` tables are not
 * comparable.
 */
template <typename TableView>
bool is_relationally_comparable(TableView const& lhs, TableView const& rhs);

// Explicit instantiation declarations for the host-side table view types. `extern template`
// prevents implicit instantiation in translation units that include this header; the matching
// explicit instantiation definitions are expected in a source file (not visible here).
extern template bool is_relationally_comparable<table_view>(table_view const& lhs,
table_view const& rhs);
extern template bool is_relationally_comparable<mutable_table_view>(mutable_table_view const& lhs,
mutable_table_view const& rhs);
} // namespace detail
} // namespace cudf
6 changes: 3 additions & 3 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,12 @@ class data_type {
/**
* @brief Returns the type identifier
*/
CUDA_HOST_DEVICE_CALLABLE type_id id() const noexcept { return _id; }
constexpr type_id id() const noexcept { return _id; }

/**
* @brief Returns the scale (for fixed_point types)
*/
CUDA_HOST_DEVICE_CALLABLE int32_t scale() const noexcept { return _fixed_point_scale; }
constexpr int32_t scale() const noexcept { return _fixed_point_scale; }

private:
type_id _id{type_id::EMPTY};
Expand All @@ -287,7 +287,7 @@ class data_type {
* @return true `lhs` is equal to `rhs`
* @return false `lhs` is not equal to `rhs`
*/
inline bool operator==(data_type const& lhs, data_type const& rhs)
constexpr bool operator==(data_type const& lhs, data_type const& rhs)
{
// use std::tie in the future, breaks JITIFY currently
return lhs.id() == rhs.id() && lhs.scale() == rhs.scale();
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/sort/is_sorted.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <structs/utilities.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>
Expand All @@ -34,10 +35,12 @@ auto is_sorted(cudf::table_view const& in,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream)
{
auto in_d = table_device_view::create(in);
rmm::device_vector<order> d_column_order(column_order);
// 0-table_view, 1-column_order, 2-null_precedence, 3-validity_columns
auto flattened = structs::detail::flatten_nested_columns(in, column_order, null_precedence);
auto in_d = table_device_view::create(std::get<0>(flattened), stream);
rmm::device_vector<order> d_column_order(std::get<1>(flattened));
rmm::device_vector<null_order> const d_null_precedence =
(has_nulls) ? rmm::device_vector<null_order>{null_precedence}
(has_nulls) ? rmm::device_vector<null_order>{std::get<2>(flattened)}
: rmm::device_vector<null_order>{};
auto ineq_op = row_lexicographic_comparator<has_nulls>(
*in_d, *in_d, d_column_order.data().get(), d_null_precedence.data().get());
Expand Down
14 changes: 9 additions & 5 deletions cpp/src/sort/sort_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <cudf/table/row_operators.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/traits.hpp>
#include <structs/utilities.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
Expand Down Expand Up @@ -112,19 +114,21 @@ std::unique_ptr<column> sorted_order(table_view input,
0);

// fast-path for single column sort
if (input.num_columns() == 1) {
if (input.num_columns() == 1 and not cudf::is_nested(input.column(0).type())) {
auto const single_col = input.column(0);
auto const col_order = column_order.empty() ? order::ASCENDING : column_order.front();
auto const null_prec = null_precedence.empty() ? null_order::BEFORE : null_precedence.front();
return stable ? sorted_order<true>(single_col, col_order, null_prec, stream, mr)
: sorted_order<false>(single_col, col_order, null_prec, stream, mr);
}

auto device_table = table_device_view::create(input, stream);
rmm::device_vector<order> d_column_order(column_order);
auto flattened = structs::detail::flatten_nested_columns(input, column_order, null_precedence);
auto& input_flattened = std::get<0>(flattened);
auto device_table = table_device_view::create(input_flattened, stream);
rmm::device_vector<order> d_column_order(std::get<1>(flattened));

if (has_nulls(input)) {
rmm::device_vector<null_order> d_null_precedence(null_precedence);
if (has_nulls(input_flattened)) {
rmm::device_vector<null_order> d_null_precedence(std::get<2>(flattened));
auto comparator = row_lexicographic_comparator<true>(
*device_table, *device_table, d_column_order.data().get(), d_null_precedence.data().get());
if (stable) {
Expand Down
101 changes: 100 additions & 1 deletion cpp/src/structs/utilities.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,8 @@
#include <thrust/iterator/counting_iterator.h>

#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/unary.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

Expand Down Expand Up @@ -57,6 +59,103 @@ std::vector<std::vector<column_view>> extract_ordered_struct_children(
return result;
}

/**
* @brief Flattens struct columns to constituent non-struct columns in the input table.
*
*/
struct flattened_table {
// reference variables
table_view const& input;
std::vector<order> const& column_order;
std::vector<null_order> const& null_precedence;
// output
std::vector<std::unique_ptr<column>> validity_as_column;
std::vector<column_view> flat_columns;
std::vector<order> flat_column_order;
std::vector<null_order> flat_null_precedence;

flattened_table(table_view const& input,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence)
: input(input), column_order(column_order), null_precedence(null_precedence)
{
}

// Convert null_mask to BOOL8 columns and flatten the struct children in order.
void flatten_struct_column(structs_column_view const& col,
order col_order,
null_order col_null_order)
{
if (col.nullable()) {
validity_as_column.push_back(cudf::is_valid(col));
validity_as_column.back()->set_null_mask(copy_bitmask(col));
flat_columns.push_back(validity_as_column.back()->view());
if (not column_order.empty()) flat_column_order.push_back(col_order); // doesn't matter.
if (not null_precedence.empty()) flat_null_precedence.push_back(col_null_order);
}
for (decltype(col.num_children()) i = 0; i < col.num_children(); ++i) {
auto const& child = col.get_sliced_child(i);
if (child.type().id() == type_id::STRUCT) {
flatten_struct_column(structs_column_view{child}, col_order, null_order::BEFORE);
// default spark behaviour is null_order::BEFORE
} else {
flat_columns.push_back(child);
if (not column_order.empty()) flat_column_order.push_back(col_order);
if (not null_precedence.empty()) flat_null_precedence.push_back(null_order::BEFORE);
// default spark behaviour is null_order::BEFORE
}
}
}
// Note: possibly expand for flattening list columns too.

/**
* @copydoc flattened_table
*
* @return tuple with flattened table, flattened column order, flattened null precedence,
* vector of boolean columns (struct validity).
*/
auto operator()()
{
for (auto i = 0; i < input.num_columns(); ++i) {
auto const& col = input.column(i);
if (col.type().id() == type_id::STRUCT) {
flatten_struct_column(structs_column_view{col},
(column_order.empty() ? order() : column_order[i]),
(null_precedence.empty() ? null_order() : null_precedence[i]));
} else {
flat_columns.push_back(col);
if (not column_order.empty()) flat_column_order.push_back(column_order[i]);
if (not null_precedence.empty()) flat_null_precedence.push_back(null_precedence[i]);
}
}

return std::make_tuple(table_view{flat_columns},
std::move(flat_column_order),
std::move(flat_null_precedence),
std::move(validity_as_column));
}
};

/**
 * @copydoc cudf::structs::detail::flatten_nested_columns
 */
std::tuple<table_view,
           std::vector<order>,
           std::vector<null_order>,
           std::vector<std::unique_ptr<column>>>
flatten_nested_columns(table_view const& input,
                       std::vector<order> const& column_order,
                       std::vector<null_order> const& null_precedence)
{
  auto const is_struct = [](auto const& col) { return col.type().id() == type_id::STRUCT; };

  // At least one struct column: do the actual flattening.
  if (std::any_of(input.begin(), input.end(), is_struct)) {
    return flattened_table{input, column_order, null_precedence}();
  }

  // Nothing to flatten: hand back copies of the inputs and an empty set of validity columns.
  return std::make_tuple(
    input, column_order, null_precedence, std::vector<std::unique_ptr<column>>{});
}

} // namespace detail
} // namespace structs
} // namespace cudf
20 changes: 20 additions & 0 deletions cpp/src/structs/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/structs/structs_column_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

namespace cudf {
Expand Down Expand Up @@ -48,6 +49,25 @@ namespace detail {
std::vector<std::vector<column_view>> extract_ordered_struct_children(
host_span<column_view const> struct_cols);

/**
 * @brief Flatten table with struct columns to table with constituent columns of struct columns.
 *
 * If a table does not have struct columns, same input arguments are returned (as copies),
 * together with an empty vector of validity columns.
 *
 * Each nullable struct column contributes an extra boolean column representing its validity,
 * placed ahead of that struct's children. The children of a struct (including nested structs)
 * inherit the struct's column order and are always given `null_order::BEFORE`; only the
 * validity column of a top-level struct uses the null order supplied for that struct.
 *
 * @note The returned table view references the boolean validity columns held in the last
 * tuple element, so the returned tuple must be kept alive for as long as the flattened table
 * view is in use.
 *
 * @param input input table to be flattened
 * @param column_order column order for input table
 * @param null_precedence null order for input table
 * @return tuple with flattened table, flattened column order, flattened null precedence,
 * vector of boolean columns (struct validity).
 */
std::tuple<table_view,
std::vector<order>,
std::vector<null_order>,
std::vector<std::unique_ptr<column>>>
flatten_nested_columns(table_view const& input,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence);

} // namespace detail
} // namespace structs
} // namespace cudf
Loading

0 comments on commit 20509d0

Please sign in to comment.