Skip to content

Commit

Permalink
Add public libcudf match_dictionaries API (#8429)
Browse files Browse the repository at this point in the history
This PR creates a public API for the internal libcudf `cudf::dictionary::detail::match_dictionaries` function to help with transitioning the cudf python CategoricalColumn over to using the libcudf dictionary column.

No functionality has changed or been added, but this PR does add a formal gtest for the new public API.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Devavret Makkar (https://github.com/devavret)

URL: #8429
  • Loading branch information
davidwendt authored Jun 4, 2021
1 parent ad6e0bd commit 6792be9
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 16 deletions.
14 changes: 5 additions & 9 deletions cpp/include/cudf/dictionary/detail/update_keys.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@
#include <cudf/column/column.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

Expand Down Expand Up @@ -72,18 +73,13 @@ std::unique_ptr<column> set_keys(
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create new dictionaries that have keys merged from the input dictionaries.
* @copydoc
* cudf::dictionary::match_dictionaries(cudf::host_span<dictionary_column_view const>,rmm::mr::device_memory_resource*)
*
* This will concatenate the keys for each dictionary and then call `set_keys` on each.
* The result is a vector of new dictionaries with a common set of keys.
*
* @param input Dictionary columns to match keys.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return New dictionary column.
*/
std::vector<std::unique_ptr<column>> match_dictionaries(
std::vector<dictionary_column_view> input,
cudf::host_span<dictionary_column_view const> input,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
17 changes: 16 additions & 1 deletion cpp/include/cudf/dictionary/update_keys.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/utilities/span.hpp>

namespace cudf {
namespace dictionary {
Expand Down Expand Up @@ -139,6 +140,20 @@ std::unique_ptr<column> set_keys(
column_view const& keys,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create new dictionaries that have keys merged from the input dictionaries.
*
* This will concatenate the keys for each dictionary and then call `set_keys` on each.
* The result is a vector of new dictionaries with a common set of keys.
*
* The inputs are taken as a span of const views; the results are returned as newly
* created columns. Output `i` is expected to decode to the same values as `input[i]`
* (this is what the accompanying gtest checks for a two-column input).
*
* @param input Dictionary columns to match keys.
* @param mr Device memory resource used to allocate the returned column's device memory.
*           Defaults to the current device resource.
* @return New dictionary columns.
*/
std::vector<std::unique_ptr<column>> match_dictionaries(
cudf::host_span<dictionary_column_view const> input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace dictionary
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/src/dictionary/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match");

// first combine the keys so both input dictionaries have the same set
auto matched = match_dictionaries({input, replacement}, stream, mr);
auto matched =
match_dictionaries(std::vector<dictionary_column_view>({input, replacement}), stream, mr);

// now build the new indices by doing replace-null using the updated input indices
auto const input_indices =
Expand Down
16 changes: 12 additions & 4 deletions cpp/src/dictionary/set_keys.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -150,9 +150,10 @@ std::unique_ptr<column> set_keys(
new_nulls.second);
}

std::vector<std::unique_ptr<column>> match_dictionaries(std::vector<dictionary_column_view> input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::vector<std::unique_ptr<column>> match_dictionaries(
cudf::host_span<dictionary_column_view const> input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
std::vector<column_view> keys(input.size());
std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); });
Expand Down Expand Up @@ -221,5 +222,12 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
return detail::set_keys(dictionary_column, keys, rmm::cuda_stream_default, mr);
}

// Public API entry point: opens a profiling range for this call and then
// forwards to the detail implementation using rmm::cuda_stream_default.
std::vector<std::unique_ptr<column>> match_dictionaries(
  cudf::host_span<dictionary_column_view const> input, rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::match_dictionaries(input, stream, mr);
}

} // namespace dictionary
} // namespace cudf
25 changes: 24 additions & 1 deletion cpp/tests/dictionary/set_keys_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -84,3 +84,26 @@ TEST_F(DictionarySetKeysTest, Errors)
cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), null_keys), cudf::logic_error);
}

// Checks the public match_dictionaries API: both outputs must share an equal
// key set, and each output must still decode to the same values as its input.
TEST_F(DictionarySetKeysTest, MatchDictionaries)
{
  cudf::test::dictionary_column_wrapper<int32_t> col1{5, 0, 4, 1, 2, 2, 2, 5, 0};
  cudf::test::dictionary_column_wrapper<int32_t> col2{1, 0, 3, 1, 4, 5, 6, 5, 0};

  auto const view1 = cudf::dictionary_column_view(col1);
  auto const view2 = cudf::dictionary_column_view(col2);
  auto input       = std::vector<cudf::dictionary_column_view>({view1, view2});

  auto results = cudf::dictionary::match_dictionaries(input);

  // After matching, the two dictionaries are expected to carry the same keys.
  auto const matched1 = cudf::dictionary_column_view(results[0]->view());
  auto const matched2 = cudf::dictionary_column_view(results[1]->view());
  auto keys1          = matched1.keys();
  auto keys2          = matched2.keys();
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(keys1, keys2);

  // Decoding each matched dictionary must reproduce the values of its original column.
  auto expected1 = cudf::dictionary::decode(view1);
  auto expected2 = cudf::dictionary::decode(view2);

  auto result1 = cudf::dictionary::decode(matched1);
  auto result2 = cudf::dictionary::decode(matched2);

  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result1->view(), expected1->view());
  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result2->view(), expected2->view());
}

0 comments on commit 6792be9

Please sign in to comment.