Skip to content

Commit

Permalink
Move template parameter to function parameter in cudf::detail::left_s…
Browse files Browse the repository at this point in the history
…emi_anti_join (#8914)

`semi_join.cu` takes about 6 minutes to compile on my Linux 18.04 desktop when doing a full build of libcudf. The `join_kind` template parameter used internally in `cudf::detail::left_semi_anti_join` for the `left_semi_join` and `left_anti_join` APIs is not used in a `constexpr` context or passed to any other templated function. This PR moves the template parameter to a runtime parameter on the detail functions, reducing the compile time for `semi_join.cu` by ~2x.

Another improvement includes un-inlining the `is_trivial_join` utility function to reduce the compile time for files that include `join_common_utils.hpp`.

Finally, the device vector used as a gather map in `detail::left_semi_anti_join` was wrapped with a `column_view` in order to call `detail::gather` without iterators. This made it possible to stop including the heavy `gather.cuh`, which improved the compile time by about 10% and reduced the size of the object file `semi_join.cu.o` by 2x.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Mark Harris (https://github.com/harrism)

URL: #8914
  • Loading branch information
davidwendt authored Aug 4, 2021
1 parent cc2f192 commit 29b5f9a
Show file tree
Hide file tree
Showing 6 changed files with 244 additions and 178 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ add_library(cudf
src/join/cross_join.cu
src/join/hash_join.cu
src/join/join.cu
src/join/join_utils.cu
src/join/semi_join.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/join.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down
134 changes: 22 additions & 112 deletions cpp/src/join/join_common_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sequence.h>
#include <cub/cub.cuh>

namespace cudf {
namespace detail {

/**
* @brief Computes the trivial left join operation for the case when the
* right table is empty. In this case all the valid indices of the left table
* right table is empty.
*
* In this case all the valid indices of the left table
* are returned with their corresponding right indices being set to
* JoinNoneValue, i.e. -1.
*
Expand All @@ -41,21 +41,12 @@ namespace detail {
*
* @return Join output indices vector pair
*/
inline std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
std::unique_ptr<rmm::device_uvector<size_type>>>
std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
std::unique_ptr<rmm::device_uvector<size_type>>>
get_trivial_left_join_indices(
table_view const& left,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0);
auto right_indices =
std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
thrust::uninitialized_fill(
rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue);
return std::make_pair(std::move(left_indices), std::move(right_indices));
}
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

// Convenient alias for a pair of unique pointers to device uvectors.
using VectorPair = std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
Expand Down Expand Up @@ -83,47 +74,11 @@ using VectorPair = std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
*
* @return A pair of vectors containing the concatenated output.
*/
inline VectorPair concatenate_vector_pairs(VectorPair& a,
VectorPair& b,
rmm::cuda_stream_view stream)
{
CUDF_EXPECTS((a.first->size() == a.second->size()),
"Mismatch between sizes of vectors in vector pair");
CUDF_EXPECTS((b.first->size() == b.second->size()),
"Mismatch between sizes of vectors in vector pair");
if (a.first->is_empty()) {
return std::move(b);
} else if (b.first->is_empty()) {
return std::move(a);
}
auto original_size = a.first->size();
a.first->resize(a.first->size() + b.first->size(), stream);
a.second->resize(a.second->size() + b.second->size(), stream);
thrust::copy(
rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size);
thrust::copy(rmm::exec_policy(stream),
b.second->begin(),
b.second->end(),
a.second->begin() + original_size);
return std::move(a);
}

/**
* @brief Device functor to determine if an index is contained in a range.
*/
template <typename T>
struct valid_range {
T start, stop;
__host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {}

__host__ __device__ __forceinline__ bool operator()(const T index)
{
return ((index >= start) && (index < stop));
}
};
VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream);

/**
* @brief Creates a table containing the complement of left join indices.
*
* This table has two columns. The first one is filled with JoinNoneValue(-1)
* and the second one contains values from 0 to right_table_row_count - 1
* excluding those found in the right_indices column.
Expand All @@ -136,72 +91,27 @@ struct valid_range {
*
* @return Pair of vectors containing the left join indices complement
*/
inline std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
std::unique_ptr<rmm::device_uvector<size_type>>>
std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
std::unique_ptr<rmm::device_uvector<size_type>>>
get_left_join_indices_complement(std::unique_ptr<rmm::device_uvector<size_type>>& right_indices,
size_type left_table_row_count,
size_type right_table_row_count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Get array of indices that do not appear in right_indices

// Vector allocated for unmatched result
auto right_indices_complement =
std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream);

// If left table is empty in a full join call then all rows of the right table
// should be represented in the joined indices. This is an optimization since
// if left table is empty and full join is called all the elements in
// right_indices will be JoinNoneValue, i.e. -1. This if path should
// produce exactly the same result as the else path but will be faster.
if (left_table_row_count == 0) {
thrust::sequence(rmm::exec_policy(stream),
right_indices_complement->begin(),
right_indices_complement->end(),
0);
} else {
// Assume all the indices in invalid_index_map are invalid
auto invalid_index_map =
std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream);
thrust::uninitialized_fill(
rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1});

// Functor to check for index validity since left joins can create invalid indices
valid_range<size_type> valid(0, right_table_row_count);
rmm::mr::device_memory_resource* mr);

// invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count
// Thus specifying that those locations are valid
thrust::scatter_if(rmm::exec_policy(stream),
thrust::make_constant_iterator(0),
thrust::make_constant_iterator(0) + right_indices->size(),
right_indices->begin(), // Index locations
right_indices->begin(), // Stencil - Check if index location is valid
invalid_index_map->begin(), // Output indices
valid); // Stencil Predicate
size_type begin_counter = static_cast<size_type>(0);
size_type end_counter = static_cast<size_type>(right_table_row_count);
/**
* @brief Device functor to determine if an index is contained in a range.
*/
template <typename T>
struct valid_range {
T start, stop;
__host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {}

// Create list of indices that have been marked as invalid
size_type indices_count = thrust::copy_if(rmm::exec_policy(stream),
thrust::make_counting_iterator(begin_counter),
thrust::make_counting_iterator(end_counter),
invalid_index_map->begin(),
right_indices_complement->begin(),
thrust::identity<size_type>()) -
right_indices_complement->begin();
right_indices_complement->resize(indices_count, stream);
__host__ __device__ __forceinline__ bool operator()(const T index)
{
return ((index >= start) && (index < stop));
}

auto left_invalid_indices =
std::make_unique<rmm::device_uvector<size_type>>(right_indices_complement->size(), stream);
thrust::uninitialized_fill(rmm::exec_policy(stream),
left_invalid_indices->begin(),
left_invalid_indices->end(),
JoinNoneValue);

return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement));
}
};

/**
* @brief Adds a pair of indices to the shared memory cache
Expand Down
25 changes: 2 additions & 23 deletions cpp/src/join/join_common_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,8 +19,6 @@
#include <cudf/table/row_operators.cuh>
#include <cudf/table/table_view.hpp>

#include <rmm/device_uvector.hpp>

#include <hash/concurrent_unordered_multimap.cuh>

#include <limits>
Expand Down Expand Up @@ -49,26 +47,7 @@ using row_equality = cudf::row_equality_comparator<true>;

enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };

inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type)
{
// If there is nothing to join, then send empty table with all columns
if (left.is_empty() || right.is_empty()) { return true; }

// If left join and the left table is empty, return immediately
if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; }

// If Inner Join and either table is empty, return immediately
if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) {
return true;
}

// If left semi join (contains) and right table is empty,
// return immediately
if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; }

return false;
}
bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type);

} // namespace detail

} // namespace cudf
155 changes: 155 additions & 0 deletions cpp/src/join/join_utils.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <join/join_common_utils.cuh>

#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/scatter.h>
#include <thrust/sequence.h>

namespace cudf {
namespace detail {

/**
 * @brief Reports whether a join can be answered without doing any real work.
 *
 * A join is considered trivial when `is_empty()` is true for either table, or
 * when the row counts make the outcome a foregone conclusion for the requested
 * join kind:
 *  - LEFT_JOIN: the left table has no rows
 *  - INNER_JOIN: either table has no rows
 *  - LEFT_SEMI_JOIN: the right table has no rows
 *
 * Every other join kind is only trivial through the `is_empty()` check.
 *
 * @param left      The left table of the join
 * @param right     The right table of the join
 * @param join_type The kind of join being performed
 *
 * @return true if the join is trivial, false otherwise
 */
bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type)
{
  // Nothing to join at all when either side reports itself as empty.
  if (left.is_empty() || right.is_empty()) { return true; }

  bool const left_has_no_rows  = (left.num_rows() == 0);
  bool const right_has_no_rows = (right.num_rows() == 0);

  switch (join_type) {
    // A left join is driven entirely by the left table's rows.
    case join_kind::LEFT_JOIN: return left_has_no_rows;
    // An inner join needs rows on both sides to produce anything.
    case join_kind::INNER_JOIN: return left_has_no_rows || right_has_no_rows;
    // A semi join ("contains") cannot match anything without right rows.
    case join_kind::LEFT_SEMI_JOIN: return right_has_no_rows;
    default: return false;
  }
}

/**
 * @brief Builds the left-join index pair for the case where nothing on the
 * right side can match.
 *
 * The first vector is the sequence 0, 1, ..., left.num_rows() - 1 and the
 * second vector has the same length with every element set to JoinNoneValue.
 *
 * @param left   Table whose row count determines the size of both outputs
 * @param stream CUDA stream used for the allocations and the fill kernels
 * @param mr     Device memory resource used to allocate both output vectors
 *
 * @return Pair of (left indices, right indices) vectors
 */
std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
          std::unique_ptr<rmm::device_uvector<size_type>>>
get_trivial_left_join_indices(table_view const& left,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
{
  auto const row_count = left.num_rows();

  // Left side: every row of the left table, in order.
  auto left_map = std::make_unique<rmm::device_uvector<size_type>>(row_count, stream, mr);
  thrust::sequence(rmm::exec_policy(stream), left_map->begin(), left_map->end(), 0);

  // Right side: one "no match" marker per left row.
  auto right_map = std::make_unique<rmm::device_uvector<size_type>>(row_count, stream, mr);
  thrust::uninitialized_fill(
    rmm::exec_policy(stream), right_map->begin(), right_map->end(), JoinNoneValue);

  return std::make_pair(std::move(left_map), std::move(right_map));
}

/**
 * @brief Appends the contents of vector pair `b` to vector pair `a` and
 * returns the combined pair.
 *
 * Both inputs are consumed: whichever pair is returned is moved out of its
 * argument. When one of the pairs is empty, the other is returned as is and
 * no copy is performed.
 *
 * @throws cudf::logic_error (via CUDF_EXPECTS) if the two vectors inside
 * either pair differ in size.
 *
 * @param a      Pair that receives the concatenated data
 * @param b      Pair whose data is appended after the data of `a`
 * @param stream CUDA stream used for the resizes and copies
 *
 * @return A pair of vectors containing the concatenated output
 */
VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream)
{
  CUDF_EXPECTS((a.first->size() == a.second->size()),
               "Mismatch between sizes of vectors in vector pair");
  CUDF_EXPECTS((b.first->size() == b.second->size()),
               "Mismatch between sizes of vectors in vector pair");

  // Fast paths: with one side empty the other side already is the answer.
  if (a.first->is_empty()) { return std::move(b); }
  if (b.first->is_empty()) { return std::move(a); }

  // Remember where the data of `b` has to start inside the grown vectors.
  auto const append_offset = a.first->size();

  a.first->resize(append_offset + b.first->size(), stream);
  a.second->resize(a.second->size() + b.second->size(), stream);

  auto const policy = rmm::exec_policy(stream);
  thrust::copy(policy, b.first->begin(), b.first->end(), a.first->begin() + append_offset);
  thrust::copy(policy, b.second->begin(), b.second->end(), a.second->begin() + append_offset);

  return std::move(a);
}

/**
 * @brief Creates the pair of index vectors for the right-table rows that are
 * not referenced by `right_indices`.
 *
 * The second returned vector holds, in increasing order, every value in
 * [0, right_table_row_count) that does not appear in `right_indices`. The
 * first returned vector has the same length and is filled with JoinNoneValue.
 *
 * Entries of `right_indices` outside [0, right_table_row_count) are ignored.
 *
 * @param right_indices         Right-side indices produced by a left join
 * @param left_table_row_count  Number of rows in the left table
 * @param right_table_row_count Number of rows in the right table
 * @param stream                CUDA stream used for allocations and kernels
 * @param mr                    Device memory resource used to allocate the two
 *                              returned vectors
 *
 * @return Pair of vectors containing the left join indices complement
 */
std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
          std::unique_ptr<rmm::device_uvector<size_type>>>
get_left_join_indices_complement(std::unique_ptr<rmm::device_uvector<size_type>>& right_indices,
                                 size_type left_table_row_count,
                                 size_type right_table_row_count,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  // Output buffer for the unmatched right indices, sized for the worst case
  // (no right row matched). It is returned to the caller, so it is allocated
  // from the caller-supplied memory resource.
  auto right_indices_complement =
    std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream, mr);

  // If left table is empty in a full join call then all rows of the right table
  // should be represented in the joined indices. This is an optimization since
  // if left table is empty and full join is called all the elements in
  // right_indices will be JoinNoneValue, i.e. -1. This if path should
  // produce exactly the same result as the else path but will be faster.
  if (left_table_row_count == 0) {
    thrust::sequence(rmm::exec_policy(stream),
                     right_indices_complement->begin(),
                     right_indices_complement->end(),
                     0);
  } else {
    // Scratch flags, one per right row: 1 = unmatched, 0 = matched.
    // Start by assuming every right row is unmatched. This buffer does not
    // outlive the function, so it uses the default memory resource.
    rmm::device_uvector<size_type> invalid_index_map(right_table_row_count, stream);
    thrust::uninitialized_fill(
      rmm::exec_policy(stream), invalid_index_map.begin(), invalid_index_map.end(), int32_t{1});

    // Functor to check for index validity since left joins can create invalid indices
    valid_range<size_type> valid(0, right_table_row_count);

    // invalid_index_map[right_indices[i]] = 0 for i = 0 to right_indices->size() - 1,
    // skipping any right_indices[i] rejected by `valid`. The cleared locations
    // are the right rows that were matched.
    thrust::scatter_if(rmm::exec_policy(stream),
                       thrust::make_constant_iterator(0),
                       thrust::make_constant_iterator(0) + right_indices->size(),
                       right_indices->begin(),     // Index locations
                       right_indices->begin(),     // Stencil - Check if index location is valid
                       invalid_index_map.begin(),  // Output indices
                       valid);                     // Stencil Predicate

    size_type const begin_counter = 0;
    size_type const end_counter   = right_table_row_count;

    // Compact the row numbers whose flag is still set into the output buffer.
    size_type const indices_count = thrust::copy_if(rmm::exec_policy(stream),
                                                    thrust::make_counting_iterator(begin_counter),
                                                    thrust::make_counting_iterator(end_counter),
                                                    invalid_index_map.begin(),
                                                    right_indices_complement->begin(),
                                                    thrust::identity<size_type>()) -
                                    right_indices_complement->begin();
    right_indices_complement->resize(indices_count, stream);
  }

  // Matching left-side vector: every unmatched right row pairs with "no left row".
  // Also returned to the caller, hence allocated from `mr`.
  auto left_invalid_indices = std::make_unique<rmm::device_uvector<size_type>>(
    right_indices_complement->size(), stream, mr);
  thrust::uninitialized_fill(rmm::exec_policy(stream),
                             left_invalid_indices->begin(),
                             left_invalid_indices->end(),
                             JoinNoneValue);

  return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement));
}

} // namespace detail
} // namespace cudf
Loading

0 comments on commit 29b5f9a

Please sign in to comment.