Skip to content

Commit

Permalink
Do not add nulls to the hash table when null_equality::UNEQUAL is p…
Browse files Browse the repository at this point in the history
…assed to left_semi_join and left_anti_join (#8277)

Fixes #7300

This is fundamentally the same issue and fix as https://github.com/rapidsai/cudf/pull/6943/files from @hyperbolic2346 

When nulls are considered not equal (`null_equality::UNEQUAL`), there is no point in adding them to the hash table used for the join, because they will never compare equal to anything. Adding large numbers of nulls was causing huge performance issues.

Also includes a fix to the example results in the Doxygen comments for `left_anti_join`.

Performance gain is tremendous.

Before:

```
Benchmark                                                                             Time             CPU   Iterations
-----------------------------------------------------------------------------------------------------------------------
Join<int32_t, int32_t>/left_anti_join_32bit_nulls/100000/100000/manual_time        1072 ms         1072 ms            1
Join<int32_t, int32_t>/left_anti_join_32bit_nulls/200000/400000/manual_time        4253 ms         4253 ms            1
Join<int32_t, int32_t>/left_anti_join_32bit_nulls/300000/1000000/manual_time      14016 ms        14016 ms            1
Join<int32_t, int32_t>/left_semi_join_32bit_nulls/100000/100000/manual_time         932 ms          932 ms            1
Join<int32_t, int32_t>/left_semi_join_32bit_nulls/200000/400000/manual_time        4481 ms         4481 ms            1
Join<int32_t, int32_t>/left_semi_join_32bit_nulls/300000/1000000/manual_time      14172 ms        14172 ms            1
```


After:
```
-----------------------------------------------------------------------------------------------------------------------
Benchmark                                                                             Time             CPU   Iterations
-----------------------------------------------------------------------------------------------------------------------
Join<int32_t, int32_t>/left_anti_join_32bit_nulls/100000/100000/manual_time       0.143 ms        0.162 ms         4996
Join<int32_t, int32_t>/left_anti_join_32bit_nulls/200000/400000/manual_time       0.255 ms        0.275 ms         2780
Join<int32_t, int32_t>/left_anti_join_32bit_nulls/300000/1000000/manual_time      0.514 ms        0.532 ms         1368
Join<int32_t, int32_t>/left_semi_join_32bit_nulls/100000/100000/manual_time       0.135 ms        0.155 ms         5203
Join<int32_t, int32_t>/left_semi_join_32bit_nulls/200000/400000/manual_time       0.206 ms        0.224 ms         3325
Join<int32_t, int32_t>/left_semi_join_32bit_nulls/300000/1000000/manual_time      0.368 ms        0.385 ms         1903
```

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Robert Maynard (https://github.com/robertmaynard)
  - Mark Harris (https://github.com/harrism)

URL: #8277
  • Loading branch information
nvdbaranec authored May 24, 2021
1 parent 691dd11 commit 7e725b5
Show file tree
Hide file tree
Showing 6 changed files with 365 additions and 124 deletions.
135 changes: 129 additions & 6 deletions cpp/benchmarks/join/join_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ template <typename key_type, typename payload_type>
class Join : public cudf::benchmark {
};

template <typename key_type, typename payload_type, bool Nullable>
static void BM_join(benchmark::State &state)
template <typename key_type, typename payload_type, bool Nullable, typename Join>
static void BM_join(benchmark::State& state, Join JoinFunc)
{
const cudf::size_type build_table_size{(cudf::size_type)state.range(0)};
const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)};
Expand Down Expand Up @@ -105,20 +105,69 @@ static void BM_join(benchmark::State &state)
for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);

auto result = cudf::inner_join(
auto result = JoinFunc(
probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL);
}
}

#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \
(::benchmark::State & st) { BM_join<key_type, payload_type, nullable>(st); }
// Defines the `Join` fixture benchmark `name` for cudf::inner_join.
// The join call is wrapped in a lambda and passed to BM_join, which invokes it
// inside the timed benchmark loop. BM_join supplies the tables, the join
// columns and the null-equality argument; `nullable` is forwarded as BM_join's
// `Nullable` template argument.
// The comment must stay above the #define: a `//` comment placed on a
// continuation line would swallow the rest of the macro.
#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \
(::benchmark::State & st) \
{ \
auto join = [](cudf::table_view const& left, \
cudf::table_view const& right, \
std::vector<cudf::size_type> const& left_on, \
std::vector<cudf::size_type> const& right_on, \
cudf::null_equality compare_nulls) { \
return cudf::inner_join(left, right, left_on, right_on, compare_nulls); \
}; \
BM_join<key_type, payload_type, nullable>(st, join); \
}

// Inner-join benchmark definitions: 32- and 64-bit key/payload types, each
// with the `nullable` flag off and on (`_nulls` suffix).
JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false);
JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false);
JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true);
JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true);

// Defines the `Join` fixture benchmark `name` for cudf::left_anti_join.
// The join call is wrapped in a lambda and passed to BM_join, which invokes it
// inside the timed benchmark loop. BM_join supplies the tables, the join
// columns and the null-equality argument; `nullable` is forwarded as BM_join's
// `Nullable` template argument.
// The comment must stay above the #define: a `//` comment placed on a
// continuation line would swallow the rest of the macro.
#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \
(::benchmark::State & st) \
{ \
auto join = [](cudf::table_view const& left, \
cudf::table_view const& right, \
std::vector<cudf::size_type> const& left_on, \
std::vector<cudf::size_type> const& right_on, \
cudf::null_equality compare_nulls) { \
return cudf::left_anti_join(left, right, left_on, right_on, compare_nulls); \
}; \
BM_join<key_type, payload_type, nullable>(st, join); \
}

// Left anti-join benchmark definitions: 32- and 64-bit key/payload types, each
// with the `nullable` flag off and on (`_nulls` suffix).
LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, int32_t, false);
LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, int64_t, false);
LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, int32_t, true);
LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, int64_t, true);

// Defines the `Join` fixture benchmark `name` for cudf::left_semi_join.
// The join call is wrapped in a lambda and passed to BM_join, which invokes it
// inside the timed benchmark loop. BM_join supplies the tables, the join
// columns and the null-equality argument; `nullable` is forwarded as BM_join's
// `Nullable` template argument.
// The comment must stay above the #define: a `//` comment placed on a
// continuation line would swallow the rest of the macro.
#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \
(::benchmark::State & st) \
{ \
auto join = [](cudf::table_view const& left, \
cudf::table_view const& right, \
std::vector<cudf::size_type> const& left_on, \
std::vector<cudf::size_type> const& right_on, \
cudf::null_equality compare_nulls) { \
return cudf::left_semi_join(left, right, left_on, right_on, compare_nulls); \
}; \
BM_join<key_type, payload_type, nullable>(st, join); \
}

// Left semi-join benchmark definitions: 32- and 64-bit key/payload types, each
// with the `nullable` flag off and on (`_nulls` suffix).
LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, int32_t, false);
LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false);
LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true);
LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true);

// join -----------------------------------------------------------------------
BENCHMARK_REGISTER_F(Join, join_32bit)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
Expand Down Expand Up @@ -154,3 +203,77 @@ BENCHMARK_REGISTER_F(Join, join_64bit_nulls)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();

// left anti-join -------------------------------------------------------------
// Each Args({a, b}) pair is read by BM_join as state.range(0) = build table
// size and state.range(1) = probe table size (in rows).
// NOTE(review): UseManualTime() presumably pairs with the cuda_event_timer
// created per iteration in BM_join — confirm.
BENCHMARK_REGISTER_F(Join, left_anti_join_32bit)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({100'000, 1'000'000})
->Args({10'000'000, 10'000'000})
->Args({10'000'000, 40'000'000})
->Args({10'000'000, 100'000'000})
->Args({100'000'000, 100'000'000})
->Args({80'000'000, 240'000'000})
->UseManualTime();

// 64-bit variants are registered with only two (large) size combinations.
BENCHMARK_REGISTER_F(Join, left_anti_join_64bit)
->Unit(benchmark::kMillisecond)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();

// Same size sweep as left_anti_join_32bit, with the `nullable` flag enabled.
BENCHMARK_REGISTER_F(Join, left_anti_join_32bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({100'000, 1'000'000})
->Args({10'000'000, 10'000'000})
->Args({10'000'000, 40'000'000})
->Args({10'000'000, 100'000'000})
->Args({100'000'000, 100'000'000})
->Args({80'000'000, 240'000'000})
->UseManualTime();

// Same sizes as left_anti_join_64bit, with the `nullable` flag enabled.
BENCHMARK_REGISTER_F(Join, left_anti_join_64bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();

// left semi-join -------------------------------------------------------------
// Each Args({a, b}) pair is read by BM_join as state.range(0) = build table
// size and state.range(1) = probe table size (in rows).
// NOTE(review): UseManualTime() presumably pairs with the cuda_event_timer
// created per iteration in BM_join — confirm.
BENCHMARK_REGISTER_F(Join, left_semi_join_32bit)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({100'000, 1'000'000})
->Args({10'000'000, 10'000'000})
->Args({10'000'000, 40'000'000})
->Args({10'000'000, 100'000'000})
->Args({100'000'000, 100'000'000})
->Args({80'000'000, 240'000'000})
->UseManualTime();

// 64-bit variants are registered with only two (large) size combinations.
BENCHMARK_REGISTER_F(Join, left_semi_join_64bit)
->Unit(benchmark::kMillisecond)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();

// Same size sweep as left_semi_join_32bit, with the `nullable` flag enabled.
BENCHMARK_REGISTER_F(Join, left_semi_join_32bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({100'000, 1'000'000})
->Args({10'000'000, 10'000'000})
->Args({10'000'000, 40'000'000})
->Args({10'000'000, 100'000'000})
->Args({100'000'000, 100'000'000})
->Args({80'000'000, 240'000'000})
->UseManualTime();

// Same sizes as left_semi_join_64bit, with the `nullable` flag enabled.
BENCHMARK_REGISTER_F(Join, left_semi_join_64bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({50'000'000, 50'000'000})
->Args({40'000'000, 120'000'000})
->UseManualTime();
4 changes: 2 additions & 2 deletions cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,13 +424,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
* TableB: {{1, 2, 3}, {1, 2, 5}}
* left_on: {0}
* right_on: {1}
* Result: {{0}, {1}}
* Result: {{0}}
*
* TableA: {{0, 1, 2}, {1, 2, 5}}
* TableB: {{1, 2, 3}}
* left_on: {0}
* right_on: {0}
* Result: { {0} {1} }
* Result: { {0}, {1} }
* @endcode
*
* @throw cudf::logic_error if number of elements in `left_on` or `right_on`
Expand Down
22 changes: 16 additions & 6 deletions cpp/src/join/semi_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,22 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_anti_join(
equality_build);
auto hash_table = *hash_table_ptr;

thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
right_num_rows,
[hash_table] __device__(size_type idx) mutable {
hash_table.insert(thrust::make_pair(idx, true));
});
// if compare_nulls == UNEQUAL, we can simply ignore any rows that
// contain a NULL in any column as they will never compare to equal.
auto const row_bitmask = (compare_nulls == null_equality::EQUAL)
? rmm::device_buffer{}
: cudf::detail::bitmask_and(right_flattened_keys, stream);
// skip rows that are null here.
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
right_num_rows,
[hash_table, row_bitmask = static_cast<bitmask_type const*>(row_bitmask.data())] __device__(
size_type idx) mutable {
if (!row_bitmask || cudf::bit_is_set(row_bitmask, idx)) {
hash_table.insert(thrust::make_pair(idx, true));
}
});

//
// Now we have a hash table, we need to iterate over the rows of the left table
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ ConfigureTest(GROUPBY_TEST
ConfigureTest(JOIN_TEST
join/join_tests.cpp
join/cross_join_tests.cpp
join/semi_join_tests.cpp)
join/semi_anti_join_tests.cpp)

###################################################################################################
# - is_sorted tests -------------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 7e725b5

Please sign in to comment.