Skip to content

Commit

Permalink
Support lists of structs in row lexicographic comparator (#12953)
Browse files Browse the repository at this point in the history
This implements support for lexicographic comparison of lists-of-structs columns, following the idea proposed in #11222:
 * The child column of each lists-of-structs column is replaced by an integer column holding its rank values.
 * When two tables are compared, the corresponding child columns from both tables are concatenated, ranked, and then split back into new child columns that replace the original child columns of each table.

Depends on:
 * #13005

Closes #11222.

Authors:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #12953
  • Loading branch information
ttnghia authored May 3, 2023
1 parent 9be347f commit d0a7dec
Show file tree
Hide file tree
Showing 8 changed files with 1,308 additions and 134 deletions.
12 changes: 9 additions & 3 deletions cpp/benchmarks/sort/nested_types_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,23 @@

#include <random>

/**
 * @brief Create a random table of LIST columns for the sort benchmarks.
 *
 * Reads the "size_bytes", "depth" and "null_frequency" axes from the benchmark state, configures
 * a `data_profile` (uniform distribution for LIST, list depth, null probability) and forwards it
 * to `create_random_table`.
 *
 * NOTE(review): this listing appears to show both sides of the diff without +/- markers: two
 * signatures, two `set_distribution_params` calls and two `return` statements. The one-argument
 * signature, the hard-coded `0, 5` bounds and the single-column return look like the pre-change
 * lines; confirm against the actual header before relying on either variant.
 *
 * @param state Benchmark state that supplies the axis values listed above
 * @param num_columns Number of LIST columns requested from `create_random_table`
 * @param min_val Lower bound passed to `set_distribution_params` for LIST
 * @param max_val Upper bound passed to `set_distribution_params` for LIST
 * @return The table produced by `create_random_table`
 */
inline std::unique_ptr<cudf::table> create_lists_data(nvbench::state& state)
inline std::unique_ptr<cudf::table> create_lists_data(nvbench::state& state,
cudf::size_type const num_columns = 1,
cudf::size_type const min_val = 0,
cudf::size_type const max_val = 5)
{
// Benchmark axes that drive the size and shape of the generated data.
const size_t size_bytes(state.get_int64("size_bytes"));
const cudf::size_type depth{static_cast<cudf::size_type>(state.get_int64("depth"))};
auto const null_frequency{state.get_float64("null_frequency")};

data_profile table_profile;
// Fixed bounds (0, 5); presumably the pre-change line, see the review note above.
table_profile.set_distribution_params(cudf::type_id::LIST, distribution_id::UNIFORM, 0, 5);
// Caller-provided bounds.
table_profile.set_distribution_params(
cudf::type_id::LIST, distribution_id::UNIFORM, min_val, max_val);
table_profile.set_list_depth(depth);
table_profile.set_null_probability(null_frequency);
// Single LIST column; presumably the pre-change line, see the review note above.
return create_random_table({cudf::type_id::LIST}, table_size_bytes{size_bytes}, table_profile);
// `num_columns` LIST columns sharing the same profile.
return create_random_table(std::vector<cudf::type_id>(num_columns, cudf::type_id::LIST),
table_size_bytes{size_bytes},
table_profile);
}

inline std::unique_ptr<cudf::table> create_structs_data(nvbench::state& state,
Expand Down
70 changes: 67 additions & 3 deletions cpp/benchmarks/sort/sort_lists.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,82 @@

#include <nvbench/nvbench.cuh>

void nvbench_sort_lists(nvbench::state& state)
namespace {
// Bounds forwarded to create_lists_data() as its `min_val` / `max_val` arguments by the
// benchmark bodies in this file.
constexpr cudf::size_type min_val = 0;
constexpr cudf::size_type max_val = 100;

// Benchmark body for plain lists columns: builds a random table with "num_columns" lists columns
// and runs cudf::detail::sorted_order on it under state.exec.
void sort_multiple_lists(nvbench::state& state)
{
  auto const column_count = static_cast<cudf::size_type>(state.get_int64("num_columns"));
  auto const lists_table  = create_lists_data(state, column_count, min_val, max_val);
  auto const cudf_stream  = cudf::get_default_stream();

  // Hand nvbench the same stream that the sort below is issued on.
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf_stream.value()));

  // The launch object is not needed here, so the parameter is left unnamed.
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    cudf::detail::sorted_order(
      *lists_table, {}, {}, cudf_stream, rmm::mr::get_current_device_resource());
  });
}

// Benchmark body for lists-of-structs columns.
//
// Generates random lists columns via create_lists_data(), re-wraps each one as a
// lists-of-structs column_view, and runs cudf::detail::sorted_order under state.exec.
//
// NOTE(review): this listing appears to show both sides of the diff without +/- markers. The
// `table` local and the `sorted_order(*table, ..., stream_view, ...)` call look like pre-change
// lines sitting next to the new `input_table` / `stream` path; confirm against the actual file.
void sort_lists_of_structs(nvbench::state& state)
{
auto const table = create_lists_data(state);
auto const num_columns = static_cast<cudf::size_type>(state.get_int64("num_columns"));
auto const lists_table = create_lists_data(state, num_columns, min_val, max_val);

// Starting from a table of (possibly multiple) lists columns, convert each lists column into a
// lists-of-structs column. The child of each new structs column is the child of the original
// lists column.
// The resulting lists-of-structs columns are very similar to the original lists-of-integers
// columns, so the two benchmarks are roughly comparable.
std::vector<cudf::column_view> lists_of_structs;
for (auto const& col : lists_table->view()) {
auto const child = col.child(cudf::lists_column_view::child_column_index);

// Wrap the child column in a struct column that has the same size, null mask, null count and
// offset, with the original child as its only member.
auto const new_child = cudf::column_view{cudf::data_type{cudf::type_id::STRUCT},
child.size(),
nullptr,
child.null_mask(),
child.null_count(),
child.offset(),
{child}};
// Rebuild the lists column: same size/null mask/null count/offset and the same offsets child,
// but with the struct wrapper as the data child.
auto const converted_col =
cudf::column_view{cudf::data_type{cudf::type_id::LIST},
col.size(),
nullptr,
col.null_mask(),
col.null_count(),
col.offset(),
{col.child(cudf::lists_column_view::offsets_column_index), new_child}};
lists_of_structs.push_back(converted_col);
}

auto const input_table = cudf::table_view{lists_of_structs};
auto const stream = cudf::get_default_stream();

// Hand nvbench the stream that the `input_table` sort below is issued on.
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
cudf::detail::sorted_order(*table, {}, {}, stream_view, rmm::mr::get_current_device_resource());
cudf::detail::sorted_order(input_table, {}, {}, stream, rmm::mr::get_current_device_resource());
});
}

} // namespace

// Benchmark entry point: a positive "lists_of_structs" axis value selects the lists-of-structs
// benchmark body; any other value selects the plain lists body.
void nvbench_sort_lists(nvbench::state& state)
{
  if (state.get_int64("lists_of_structs") > 0) {
    sort_lists_of_structs(state);
    return;
  }
  sort_multiple_lists(state);
}

// Registers nvbench_sort_lists under the name "sort_list" and declares the axes the benchmark
// code reads from the state: "size_bytes", "depth" and "null_frequency" (data generation),
// "num_columns" (number of lists columns) and "lists_of_structs" (0 = plain lists,
// 1 = lists of structs).
// NOTE(review): the "size_bytes" values are presumably exponents of two (2^10 ... 2^28), given
// the power-of-two axis type; confirm against the nvbench documentation.
NVBENCH_BENCH(nvbench_sort_lists)
.set_name("sort_list")
.add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28})
.add_int64_axis("depth", {1, 4})
.add_int64_axis("num_columns", {1})
.add_int64_axis("lists_of_structs", {0, 1})
.add_float64_axis("null_frequency", {0, 0.2});
17 changes: 16 additions & 1 deletion cpp/include/cudf/detail/sorting.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@

#pragma once

#include <cudf/sorting.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>

Expand Down Expand Up @@ -61,6 +62,20 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @copydoc cudf::rank
*
* Counterpart of `cudf::rank` that takes the CUDA stream as an explicit argument; all other
* documentation is inherited through the `@copydoc` above.
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> rank(column_view const& input,
rank_method method,
order column_order,
null_policy null_handling,
null_order null_precedence,
bool percentage,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @copydoc cudf::stable_sort_by_key
*
Expand Down
Loading

0 comments on commit d0a7dec

Please sign in to comment.