Skip to content

Commit

Permalink
Struct binary search (lower_bound/upper_bound) (#7865)
Browse files Browse the repository at this point in the history
This PR adds support for `lower_bound` and `upper_bound` binary searches on struct columns. This closes #7690.

In addition to adding binary search for structs, I also refactored `tests/search/search_test.cpp`, extracting the dictionary search tests from it. As a result, the basic search tests, the dictionary search tests, and the (new) struct search tests now live in separate source files, which makes them easier to navigate and to maintain in the future.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - David Wendt (https://github.com/davidwendt)
  - Keith Kraus (https://github.com/kkraus14)

URL: #7865
  • Loading branch information
ttnghia authored Apr 19, 2021
1 parent 4893259 commit 1775c3d
Show file tree
Hide file tree
Showing 5 changed files with 375 additions and 132 deletions.
94 changes: 41 additions & 53 deletions cpp/src/search/search.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,20 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/search.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/dictionary/detail/search.hpp>
#include <cudf/dictionary/detail/update_keys.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/search.hpp>
#include <cudf/table/row_operators.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <structs/utilities.hpp>

#include <hash/unordered_multiset.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
Expand Down Expand Up @@ -75,71 +78,56 @@ std::unique_ptr<column> search_ordered(table_view const& t,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(
column_order.empty() or static_cast<std::size_t>(t.num_columns()) == column_order.size(),
"Mismatch between number of columns and column order.");
CUDF_EXPECTS(
null_precedence.empty() or static_cast<std::size_t>(t.num_columns()) == null_precedence.size(),
"Mismatch between number of columns and null precedence.");

// Allocate result column
std::unique_ptr<column> result = make_numeric_column(
auto result = make_numeric_column(
data_type{type_to_id<size_type>()}, values.num_rows(), mask_state::UNALLOCATED, stream, mr);

mutable_column_view result_view = result.get()->mutable_view();
auto const result_out = result->mutable_view().data<size_type>();

// Handle empty inputs
if (t.num_rows() == 0) {
CUDA_TRY(cudaMemsetAsync(
result_view.data<size_type>(), 0, values.num_rows() * sizeof(size_type), stream.value()));
CUDA_TRY(cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
return result;
}

if (not column_order.empty()) {
CUDF_EXPECTS(static_cast<std::size_t>(t.num_columns()) == column_order.size(),
"Mismatch between number of columns and column order.");
}

if (not null_precedence.empty()) {
CUDF_EXPECTS(static_cast<std::size_t>(t.num_columns()) == null_precedence.size(),
"Mismatch between number of columns and null precedence.");
}

// This utility will ensure all corresponding dictionary columns have matching keys.
// It will return any new dictionary columns created as well as updated table_views.
auto matched = dictionary::detail::match_dictionaries({t, values}, stream);
auto d_t = table_device_view::create(matched.second.front(), stream);
auto d_values = table_device_view::create(matched.second.back(), stream);
auto count_it = thrust::make_counting_iterator<size_type>(0);

rmm::device_vector<order> d_column_order(column_order.begin(), column_order.end());
rmm::device_vector<null_order> d_null_precedence(null_precedence.begin(), null_precedence.end());

auto const matched = dictionary::detail::match_dictionaries({t, values}, stream);

// 0-table_view, 1-column_order, 2-null_precedence, 3-validity_columns
auto const t_flattened =
structs::detail::flatten_nested_columns(matched.second.front(), column_order, null_precedence);
auto const values_flattened =
structs::detail::flatten_nested_columns(matched.second.back(), {}, {});

auto const t_d = table_device_view::create(std::get<0>(t_flattened), stream);
auto const values_d = table_device_view::create(std::get<0>(values_flattened), stream);
auto const& lhs = find_first ? *t_d : *values_d;
auto const& rhs = find_first ? *values_d : *t_d;

auto const& column_order_flattened = std::get<1>(t_flattened);
auto const& null_precedence_flattened = std::get<2>(t_flattened);
auto const column_order_dv = detail::make_device_uvector_async(column_order_flattened, stream);
auto const null_precedence_dv =
detail::make_device_uvector_async(null_precedence_flattened, stream);

auto const count_it = thrust::make_counting_iterator<size_type>(0);
if (has_nulls(t) or has_nulls(values)) {
auto ineq_op =
(find_first)
? row_lexicographic_comparator<true>(
*d_t, *d_values, d_column_order.data().get(), d_null_precedence.data().get())
: row_lexicographic_comparator<true>(
*d_values, *d_t, d_column_order.data().get(), d_null_precedence.data().get());

launch_search(count_it,
count_it,
t.num_rows(),
values.num_rows(),
result_view.data<size_type>(),
ineq_op,
find_first,
stream);
auto const comp = row_lexicographic_comparator<true>(
lhs, rhs, column_order_dv.data(), null_precedence_dv.data());
launch_search(
count_it, count_it, t.num_rows(), values.num_rows(), result_out, comp, find_first, stream);
} else {
auto ineq_op =
(find_first)
? row_lexicographic_comparator<false>(
*d_t, *d_values, d_column_order.data().get(), d_null_precedence.data().get())
: row_lexicographic_comparator<false>(
*d_values, *d_t, d_column_order.data().get(), d_null_precedence.data().get());

launch_search(count_it,
count_it,
t.num_rows(),
values.num_rows(),
result_view.data<size_type>(),
ineq_op,
find_first,
stream);
auto const comp = row_lexicographic_comparator<false>(
lhs, rhs, column_order_dv.data(), null_precedence_dv.data());
launch_search(
count_it, count_it, t.num_rows(), values.num_rows(), result_out, comp, find_first, stream);
}

return result;
Expand Down
5 changes: 4 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,10 @@ ConfigureTest(FILLING_TEST

###################################################################################################
# - search test -----------------------------------------------------------------------------------
ConfigureTest(SEARCH_TEST search/search_test.cpp)
ConfigureTest(SEARCH_TEST
search/search_dictionary_test.cpp
search/search_struct_test.cpp
search/search_test.cpp)

###################################################################################################
# - reshape test ----------------------------------------------------------------------------------
Expand Down
107 changes: 107 additions & 0 deletions cpp/tests/search/search_dictionary_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/type_lists.hpp>

#include <cudf/search.hpp>

// Fixture shared by every dictionary-column search test in this file.
// It adds no state of its own on top of cudf::test::BaseFixture.
struct DictionarySearchTest : cudf::test::BaseFixture {
};

using cudf::numeric_scalar;
using cudf::size_type;
using cudf::string_scalar;
using cudf::test::fixed_width_column_wrapper;

// Runs upper_bound and lower_bound over single-column tables of string
// dictionaries that contain null rows, sorted ascending with nulls placed
// before all valid keys.
TEST_F(DictionarySearchTest, search_dictionary)
{
  using string_dictionary = cudf::test::dictionary_column_wrapper<std::string>;

  // Haystack: the first two rows are null (validity 0), the rest are valid.
  string_dictionary haystack({"", "", "10", "10", "20", "20", "30", "40"},
                             {0, 0, 1, 1, 1, 1, 1, 1});
  // Needles: only the first row is null.
  string_dictionary needles({"", "08", "10", "11", "30", "32", "90"}, {0, 1, 1, 1, 1, 1, 1});

  // Both searches use the same tables and the same ordering arguments.
  auto const haystack_table = cudf::table_view{{haystack}};
  auto const needles_table  = cudf::table_view{{needles}};
  std::vector<cudf::order> const sort_order{cudf::order::ASCENDING};
  std::vector<cudf::null_order> const null_placement{cudf::null_order::BEFORE};

  auto const upper = cudf::upper_bound(haystack_table, needles_table, sort_order, null_placement);
  fixed_width_column_wrapper<size_type> expect_upper{2, 2, 4, 4, 7, 7, 8};
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*upper, expect_upper);

  auto const lower = cudf::lower_bound(haystack_table, needles_table, sort_order, null_placement);
  fixed_width_column_wrapper<size_type> expect_lower{0, 2, 2, 4, 6, 7, 8};
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*lower, expect_lower);
}

// Runs lower_bound and upper_bound over three-column tables whose last column
// is a dictionary column. Every column carries a validity mask, the first two
// columns are ordered ascending, the dictionary column descending, and nulls
// are placed after valid values in all three.
TEST_F(DictionarySearchTest, search_table_dictionary)
{
  // Haystack columns (11 rows each), given as {values}, {validity}.
  fixed_width_column_wrapper<int32_t> haystack_ints{{10, 10, 20, 20, 20, 20, 20, 20, 20, 50, 30},
                                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}};
  fixed_width_column_wrapper<float> haystack_floats{{5.0, 6.0, .5, .5, .5, .5, .7, .7, .7, .7, .5},
                                                    {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
  cudf::test::dictionary_column_wrapper<int16_t> haystack_dict{
    {90, 95, 77, 78, 79, 76, 61, 62, 63, 41, 50}, {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1}};
  cudf::table_view haystack({haystack_ints, haystack_floats, haystack_dict});

  // Needle columns (3 rows each), laid out like the haystack.
  fixed_width_column_wrapper<int32_t> needle_ints{{10, 40, 20}, {1, 0, 1}};
  fixed_width_column_wrapper<float> needle_floats{{6., .5, .5}, {0, 1, 1}};
  cudf::test::dictionary_column_wrapper<int16_t> needle_dict{{95, 50, 77}, {1, 1, 0}};
  cudf::table_view needles({needle_ints, needle_floats, needle_dict});

  // One ordering entry per column.
  std::vector<cudf::order> const sort_orders{
    cudf::order::ASCENDING, cudf::order::ASCENDING, cudf::order::DESCENDING};
  std::vector<cudf::null_order> const null_orders{
    cudf::null_order::AFTER, cudf::null_order::AFTER, cudf::null_order::AFTER};

  auto const lower = cudf::lower_bound(haystack, needles, sort_orders, null_orders);
  fixed_width_column_wrapper<size_type> expect_lower{1, 10, 2};
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*lower, expect_lower);

  auto const upper = cudf::upper_bound(haystack, needles, sort_orders, null_orders);
  fixed_width_column_wrapper<size_type> expect_upper{2, 11, 6};
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*upper, expect_upper);
}

// Exercises both cudf::contains overloads on a string dictionary column
// without nulls: the scalar lookup and the column-of-needles lookup.
TEST_F(DictionarySearchTest, contains_dictionary)
{
  using string_dictionary = cudf::test::dictionary_column_wrapper<std::string>;

  string_dictionary haystack({"00", "00", "17", "17", "23", "23", "29"});

  // Scalar form: "23" is one of the haystack values, "28" is not.
  bool const has_23 = cudf::contains(haystack, string_scalar{"23"});
  EXPECT_TRUE(has_23);
  bool const has_28 = cudf::contains(haystack, string_scalar{"28"});
  EXPECT_FALSE(has_28);

  // Column form: one boolean per haystack row. Only the last row ("29") is
  // missing from the needles.
  string_dictionary needles({"00", "17", "23", "27"});
  fixed_width_column_wrapper<bool> expected{1, 1, 1, 1, 1, 1, 0};
  auto const found = cudf::contains(haystack, needles);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*found, expected);
}

// Exercises both cudf::contains overloads on an int64 dictionary column that
// has one null row (index 1).
TEST_F(DictionarySearchTest, contains_nullable_dictionary)
{
  using int64_dictionary = cudf::test::dictionary_column_wrapper<int64_t>;

  int64_dictionary haystack({0, 0, 17, 17, 23, 23, 29}, {1, 0, 1, 1, 1, 1, 1});

  // Scalar form: 23 is one of the haystack values, 28 is not.
  bool const has_23 = cudf::contains(haystack, numeric_scalar<int64_t>{23});
  EXPECT_TRUE(has_23);
  bool const has_28 = cudf::contains(haystack, numeric_scalar<int64_t>{28});
  EXPECT_FALSE(has_28);

  // Column form: the expected output is null at the haystack's null row and
  // false only for the last row (29), which is missing from the needles.
  int64_dictionary needles({0, 17, 23, 27});
  fixed_width_column_wrapper<bool> expected({1, 0, 1, 1, 1, 1, 0}, {1, 0, 1, 1, 1, 1, 1});
  auto const found = cudf::contains(haystack, needles);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*found, expected);
}
Loading

0 comments on commit 1775c3d

Please sign in to comment.