From 506e610edaa874cc1d4554b26e17c4db3c7b377e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Jun 2021 14:49:10 -0400 Subject: [PATCH 01/55] Add cuco dependency in CMake --- cpp/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 090f613a9d1..1b2fff3df11 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -135,6 +135,8 @@ include(cmake/thirdparty/CUDF_GetDLPack.cmake) include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) # find or install GoogleTest include(cmake/thirdparty/CUDF_GetGTest.cmake) +# find cuCollections +include(cmake/thirdparty/CUDF_GetCUCO.cmake) # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile From 0bc588182bf1f7aaace796c4255afdc02e449e8a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Jun 2021 16:09:56 -0400 Subject: [PATCH 02/55] Add CUDF_GetCUCO CMake file --- cpp/cmake/thirdparty/CUDF_GetCUCO.cmake | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 cpp/cmake/thirdparty/CUDF_GetCUCO.cmake diff --git a/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake b/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake new file mode 100644 index 00000000000..94a3899f781 --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake @@ -0,0 +1,34 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_cuco) + if(CUCO_INCLUDE) + set(CUCO_INCLUDE_DIR "${CUCO_INCLUDE}" PARENT_SCOPE) + return() + endif() + if(CUCO_INCLUDE_DIR) + set(CUCO_INCLUDE_DIR ${CUCO_INCLUDE_DIR} PARENT_SCOPE) + return() + endif() + CPMFindPackage(NAME cuco + GITHUB_REPOSITORY PointKernel/cuCollections + GIT_TAG static-multi-map + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE) + set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE) +endfunction() + +find_and_configure_cuco() From 04d588cbc7c42f9b26035c55a98a046003a06d8c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Jun 2021 17:12:02 -0400 Subject: [PATCH 03/55] Add cuco CPM build options --- cpp/cmake/thirdparty/CUDF_GetCUCO.cmake | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake b/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake index 94a3899f781..ee06d53de32 100644 --- a/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake @@ -25,9 +25,12 @@ function(find_and_configure_cuco) endif() CPMFindPackage(NAME cuco GITHUB_REPOSITORY PointKernel/cuCollections - GIT_TAG static-multi-map - GIT_SHALLOW TRUE - DOWNLOAD_ONLY TRUE) + GIT_TAG static-multi-map + GIT_SHALLOW TRUE + DOWNLOAD_ONLY TRUE + OPTIONS "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + "BUILD_TESTS OFF") set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE) endfunction() From 108a0aa3fc9e7ba8f473a1b10d61354b3a0aa30a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 14 Jun 2021 19:06:14 -0400 Subject: [PATCH 04/55] Add cuco include dir --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1b2fff3df11..88d53f886f9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -442,6 +442,7 @@ target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") target_include_directories(cudf PUBLIC "$" "$" + "$" "$" "$" "$" From 720922a1e820595e3584ff671db36485960aafc9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Jun 2021 16:01:15 -0400 Subject: [PATCH 05/55] Set cuco::static_multimap as default multimap --- cpp/src/join/join_common_utils.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 9312704f065..7fe11d43dca 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -21,6 +21,7 @@ #include +#include #include #include @@ -37,14 +38,11 @@ using VectorPair = std::pair>, std::unique_ptr>>; using multimap_type = - concurrent_unordered_multimap::max(), - std::numeric_limits::max(), - default_hash, - equal_to, - default_allocator>>; + cuco::static_multimap, + cuda::thread_scope_device, + default_allocator>>; using row_hash = cudf::row_hasher; From c192e7110b473867c9215ccf58fdee1008a61c50 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Jun 2021 16:17:50 -0400 Subject: [PATCH 06/55] Refactor join APIs: take cuco multimap view as argument --- cpp/src/join/hash_join.cu | 35 +++++++++++++++++------------------ cpp/src/join/hash_join.cuh | 8 +++++--- cpp/src/join/join_kernels.cuh | 24 +++++++++++++++--------- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index dfe3231e897..b503353905b 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -180,28 +180,31 @@ std::unique_ptr> build_join_ size_type const build_table_num_rows{build_device_table->num_rows()}; std::size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); + /* auto hash_table = multimap_type::create(hash_table_size, stream, true, multimap_type::hasher(), multimap_type::key_equal(), multimap_type::allocator_type()); + */ + auto hash_table = std::make_unique(hash_table_size, + std::numeric_limits::max(), + std::numeric_limits::max()); + + auto hash_table_view = hash_table->get_device_mutable_view(); row_hash hash_build{*build_device_table}; - rmm::device_scalar failure(0, stream); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(build_table_num_rows, block_size); auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} : cudf::detail::bitmask_and(build, stream); build_hash_table<<>>( - *hash_table, + hash_table_view, hash_build, build_table_num_rows, - static_cast(row_bitmask.data()), - failure.data()); - // Check error code from the kernel - if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); } + static_cast(row_bitmask.data())); return hash_table; } @@ -251,6 +254,7 @@ probe_join_hash_table(cudf::table_device_view build_table, auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); + auto const hash_table_view = hash_table.get_device_view(); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(probe_table.num_rows(), block_size); @@ -258,12 +262,9 @@ probe_join_hash_table(cudf::table_device_view build_table, row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { - probe_hash_table + probe_hash_table <<>>( - hash_table, + hash_table_view, build_table, probe_table, hash_probe, @@ -276,9 +277,9 @@ probe_join_hash_table(cudf::table_device_view build_table, left_indices->resize(actual_size, stream); right_indices->resize(actual_size, stream); } else { - probe_hash_table + probe_hash_table <<>>( - hash_table, + hash_table_view, build_table, probe_table, hash_probe, @@ -323,17 +324,15 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); + auto const hash_table_view = hash_table.get_device_view(); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(probe_table.num_rows(), block_size); row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - probe_hash_table - <<>>(hash_table, + probe_hash_table + <<>>(hash_table_view, build_table, probe_table, hash_probe, diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index f9ccbd68c74..c7a45a7f4ce 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -91,8 +91,9 @@ std::size_t compute_join_output_size(table_device_view build_table, constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; int numBlocks{-1}; + using multimap_view_type = typename multimap_type::device_view; CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, compute_join_output_size, block_size, 0)); + &numBlocks, compute_join_output_size, block_size, 0)); int dev_id{-1}; CUDA_TRY(cudaGetDevice(&dev_id)); @@ -102,10 +103,11 @@ std::size_t compute_join_output_size(table_device_view build_table, row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; + auto const hash_table_view = hash_table.get_device_view(); // Probe the hash table without actually building the output to simply // find what the size of the output will be. - compute_join_output_size - <<>>(hash_table, + compute_join_output_size + <<>>(hash_table_view, build_table, probe_table, hash_probe, diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index 4298706987c..973f94321da 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -77,15 +77,15 @@ constexpr auto remap_sentinel_hash(H hash, S sentinel) * value in row `i` of input keys. This is nullptr if nulls are equal. * @param[out] error Pointer used to set an error code if the insert fails */ -template -__global__ void build_hash_table(multimap_type multi_map, +template +__global__ void build_hash_table(multimap_view_type multi_map, row_hash hash_build, const cudf::size_type build_table_num_rows, - bitmask_type const* row_bitmask, - int* error) + bitmask_type const* row_bitmask) { cudf::size_type i = threadIdx.x + blockIdx.x * blockDim.x; + /* while (i < build_table_num_rows) { if (!row_bitmask || cudf::bit_is_set(row_bitmask, i)) { // Compute the hash value of this row @@ -102,6 +102,7 @@ __global__ void build_hash_table(multimap_type multi_map, } i += blockDim.x * gridDim.x; } + */ } /** @@ -120,8 +121,8 @@ __global__ void build_hash_table(multimap_type multi_map, * @param[in] probe_table_num_rows The number of rows in the probe table * @param[out] output_size The resulting output size */ -template -__global__ void compute_join_output_size(multimap_type multi_map, +template +__global__ void compute_join_output_size(multimap_view_type multi_map, table_device_view build_table, table_device_view probe_table, row_hash hash_probe, @@ -138,6 +139,7 @@ __global__ void compute_join_output_size(multimap_type multi_map, cudf::size_type thread_counter{0}; const cudf::size_type start_idx = threadIdx.x + blockIdx.x * blockDim.x; const cudf::size_type stride = blockDim.x * gridDim.x; + /* const auto unused_key = multi_map.get_unused_key(); const auto end = multi_map.end(); @@ -197,6 +199,7 @@ __global__ void compute_join_output_size(multimap_type multi_map, // Add block counter to global counter if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + */ } /** @@ -298,10 +301,10 @@ __device__ void flush_output_cache(const unsigned int activemask, * @param[in] max_size The maximum size of the output */ template -__global__ void probe_hash_table(multimap_type multi_map, + cudf::size_type output_cache_size, + typename multimap_view_type> +__global__ void probe_hash_table(multimap_view_type multi_map, table_device_view build_table, table_device_view probe_table, row_hash hash_probe, @@ -326,6 +329,8 @@ __global__ void probe_hash_table(multimap_type multi_map, size_type probe_row_index = threadIdx.x + blockIdx.x * blockDim.x; + /* + const unsigned int activemask = __ballot_sync(0xffffffff, probe_row_index < probe_table_num_rows); if (probe_row_index < probe_table_num_rows) { const auto unused_key = multi_map.get_unused_key(); @@ -425,6 +430,7 @@ __global__ void probe_hash_table(multimap_type multi_map, join_output_r); } } +*/ } /** From a3a92eaa4c05effcfb57f144deaab056f7cd2429 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 22 Jun 2021 16:27:54 -0400 Subject: [PATCH 07/55] Update docs --- cpp/src/join/join_kernels.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index 973f94321da..2eab06bfd68 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -68,14 +68,13 @@ constexpr auto remap_sentinel_hash(H hash, S sentinel) * @brief Builds a hash table from a row hasher that maps the hash * values of each row to its respective row index. * - * @tparam multimap_type The type of the hash table + * @tparam multimap_view_type The type of the hash table view * * @param[in,out] multi_map The hash table to be built to insert rows into * @param[in] hash_build Row hasher for the build table * @param[in] build_table_num_rows The number of rows in the build table * @param[in] row_bitmask Bitmask where bit `i` indicates the presence of a null * value in row `i` of input keys. This is nullptr if nulls are equal. - * @param[out] error Pointer used to set an error code if the insert fails */ template __global__ void build_hash_table(multimap_view_type multi_map, @@ -110,10 +109,10 @@ __global__ void build_hash_table(multimap_view_type multi_map, * by probing the hash map with the probe table and counting the number of matches. * * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The datatype of the hash table * @tparam block_size The number of threads per block for this kernel + * @tparam multimap_view_type The datatype of the hash table view * - * @param[in] multi_map The hash table built on the build table + * @param[in] multi_map The view of the hash table built on the build table * @param[in] build_table The build table * @param[in] probe_table The probe table * @param[in] hash_probe Row hasher for the probe table @@ -285,11 +284,11 @@ __device__ void flush_output_cache(const unsigned int activemask, * Join operation. * * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The type of the hash table * @tparam block_size The number of threads per block for this kernel * @tparam output_cache_size The side of the shared memory buffer to cache join output results + * @tparam multimap_type The type of the hash table view * - * @param[in] multi_map The hash table built from the build table + * @param[in] multi_map The view of the hash table built from the build table * @param[in] build_table The build table * @param[in] probe_table The probe table * @param[in] hash_probe Row hasher for the probe table From c3e67ade70f0b651ec625af20e191197e9f1d1b2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 28 Jun 2021 11:40:08 -0400 Subject: [PATCH 08/55] Insert using cuco multimap --- cpp/src/join/hash_join.cu | 13 ++++++++----- cpp/src/join/join_kernels.cuh | 32 +++++++++++++++++--------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b503353905b..e0f5ddff097 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -200,11 +200,14 @@ std::unique_ptr> build_join_ auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} : cudf::detail::bitmask_and(build, stream); - build_hash_table<<>>( - hash_table_view, - hash_build, - build_table_num_rows, - static_cast(row_bitmask.data())); + auto const cg_size = multimap_type::cg_size(); + + build_hash_table + <<>>( + hash_table_view, + hash_build, + build_table_num_rows, + static_cast(row_bitmask.data())); return hash_table; } diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index 2eab06bfd68..821b88c099d 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -26,6 +26,8 @@ namespace cudf { namespace detail { +namespace cg = cooperative_groups; + /** * @brief Adds a pair of indices to the shared memory cache * @@ -70,38 +72,38 @@ constexpr auto remap_sentinel_hash(H hash, S sentinel) * * @tparam multimap_view_type The type of the hash table view * - * @param[in,out] multi_map The hash table to be built to insert rows into + * @param[in,out] multimap_view The hash table to be built to insert rows into * @param[in] hash_build Row hasher for the build table * @param[in] build_table_num_rows The number of rows in the build table * @param[in] row_bitmask Bitmask where bit `i` indicates the presence of a null * value in row `i` of input keys. This is nullptr if nulls are equal. */ -template -__global__ void build_hash_table(multimap_view_type multi_map, +template +__global__ void build_hash_table(multimap_view_type multimap_view, row_hash hash_build, const cudf::size_type build_table_num_rows, bitmask_type const* row_bitmask) { - cudf::size_type i = threadIdx.x + blockIdx.x * blockDim.x; + auto g = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto i = tid / cg_size; - /* while (i < build_table_num_rows) { if (!row_bitmask || cudf::bit_is_set(row_bitmask, i)) { // Compute the hash value of this row - auto const row_hash_value = remap_sentinel_hash(hash_build(i), multi_map.get_unused_key()); + auto const row_hash_value = + remap_sentinel_hash(hash_build(i), multimap_view.get_empty_key_sentinel()); - // Insert the (row hash value, row index) into the map - // using the row hash value to determine the location in the - // hash map where the new pair should be inserted - auto const insert_location = - multi_map.insert(thrust::make_pair(row_hash_value, i), true, row_hash_value); + auto insert_pair = + cuco::make_pair(std::move(row_hash_value), std::move(i)); - // If the insert failed, set the error code accordingly - if (multi_map.end() == insert_location) { *error = 1; } + multimap_view.insert(g, insert_pair); } - i += blockDim.x * gridDim.x; + i += (blockDim.x * gridDim.x) / cg_size; } - */ } /** From 6376ca62eb00e88c436a8e14aac0388b4b9ef6fd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 20 Jul 2021 17:16:20 -0400 Subject: [PATCH 09/55] Add pair_equality callable --- cpp/src/join/join_common_utils.hpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 7fe11d43dca..0ca4adbe28a 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,6 +48,24 @@ using row_hash = cudf::row_hasher; using row_equality = cudf::row_equality_comparator; +class pair_equality { + public: + pair_equality(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true) + : row_equality{lhs, rhs, nulls_are_equal} + { + } + + __device__ __inline__ bool operator()( + const cuco::pair_type& lhs, + const cuco::pair_type& rhs) const noexcept + { + return lhs.first == rhs.first and row_equality(rhs.second, lhs.second); + } + + private: + cudf::row_equality_comparator row_equality; +}; + enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) From 8c5adb4464b50415e782f502002240a8a0a54bc2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 22 Jul 2021 16:29:15 -0400 Subject: [PATCH 10/55] Optimize pair_equality callable --- cpp/src/join/join_common_utils.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 0ca4adbe28a..8bfbbc2edce 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -51,7 +51,7 @@ using row_equality = cudf::row_equality_comparator; class pair_equality { public: pair_equality(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true) - : row_equality{lhs, rhs, nulls_are_equal} + : check_row_equality{lhs, rhs, nulls_are_equal} { } @@ -59,11 +59,13 @@ class pair_equality { const cuco::pair_type& lhs, const cuco::pair_type& rhs) const noexcept { - return lhs.first == rhs.first and row_equality(rhs.second, lhs.second); + bool res = (lhs.first == rhs.first); + if (res) { return check_row_equality(rhs.second, lhs.second); } + return res; } private: - cudf::row_equality_comparator row_equality; + cudf::row_equality_comparator check_row_equality; }; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; From 9920ccb5a3ca6e883b77f733b6e0325151eb50bd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 27 Jul 2021 10:04:20 -0400 Subject: [PATCH 11/55] Refactor build_hash_table and compute_join_output_size to use cuco multimap --- cpp/src/join/hash_join.cu | 40 ++++--------- cpp/src/join/hash_join.cuh | 34 ++++++----- cpp/src/join/join_kernels.cuh | 110 +++++++++++++--------------------- 3 files changed, 72 insertions(+), 112 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 77e51ff8044..ad6d21240d4 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,10 +158,6 @@ get_left_join_indices_complement(std::unique_ptr> /** * @brief Builds the hash table based on the given `build_table`. * - * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table is 0. - * @throw cudf::logic_error if insertion to the hash table fails. - * * @param build Table of columns used to build join hash. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. @@ -179,35 +175,25 @@ std::unique_ptr> build_join_h size_type const build_table_num_rows{build_device_table->num_rows()}; std::size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); - /* - auto hash_table = multimap_type::create(hash_table_size, - stream, - true, - multimap_type::hasher(), - multimap_type::key_equal(), - multimap_type::allocator_type()); - */ - auto hash_table = std::make_unique(hash_table_size, + auto hash_table = std::make_unique(hash_table_size, std::numeric_limits::max(), std::numeric_limits::max()); - auto hash_table_view = hash_table->get_device_mutable_view(); - row_hash hash_build{*build_device_table}; + constexpr auto cg_size = multimap_type::cg_size(); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(build_table_num_rows, block_size); + detail::grid_1d config(build_table_num_rows * cg_size, block_size); + auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} : cudf::detail::bitmask_and(build, stream); + row_hash hash_build{*build_device_table}; - auto const cg_size = multimap_type::cg_size(); - - build_hash_table - <<>>( - hash_table_view, - hash_build, - build_table_num_rows, - static_cast(row_bitmask.data())); + build_hash_table<<>>( + hash_table_view, + hash_build, + build_table_num_rows, + static_cast(row_bitmask.data())); return hash_table; } @@ -263,7 +249,7 @@ probe_join_hash_table(cudf::table_device_view build_table, detail::grid_1d config(probe_table.num_rows(), block_size); row_hash hash_probe{probe_table}; - row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { probe_hash_table <<>>( @@ -333,7 +319,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, detail::grid_1d config(probe_table.num_rows(), block_size); row_hash hash_probe{probe_table}; - row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; probe_hash_table <<>>(hash_table_view, build_table, diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index d77448a6d07..ffe97cc4907 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,17 +83,24 @@ std::size_t compute_join_output_size(table_device_view build_table, } // Allocate storage for the counter used to get the size of the join output - std::size_t h_size{0}; - rmm::device_scalar d_size(0, stream); + rmm::device_scalar> d_size(0, stream); CHECK_CUDA(stream.value()); + constexpr auto cg_size = multimap_type::cg_size(); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; int numBlocks{-1}; using multimap_view_type = typename multimap_type::device_view; CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, compute_join_output_size, block_size, 0)); + &numBlocks, + compute_join_output_size>, + block_size, + 0)); int dev_id{-1}; CUDA_TRY(cudaGetDevice(&dev_id)); @@ -102,23 +109,20 @@ std::size_t compute_join_output_size(table_device_view build_table, CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); row_hash hash_probe{probe_table}; - row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; + auto const hash_table_view = hash_table.get_device_view(); + // Probe the hash table without actually building the output to simply // find what the size of the output will be. - compute_join_output_size - <<>>(hash_table_view, - build_table, - probe_table, - hash_probe, - equality, - probe_table_num_rows, - d_size.data()); + compute_join_output_size + <<>>( + hash_table_view, hash_probe, equality, probe_table_num_rows, d_size.data()); CHECK_CUDA(stream.value()); - h_size = d_size.value(stream); + auto h_size = d_size.value(stream); - return h_size; + return h_size.load(cuda::std::memory_order_relaxed); } /** diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index ecfac3d4e9a..fad699050fa 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -76,6 +76,7 @@ constexpr auto remap_sentinel_hash(H hash, S sentinel) * @brief Builds a hash table from a row hasher that maps the hash * values of each row to its respective row index. * + * @tparam cg_size The size of the CUDA cooperative thread group * @tparam multimap_view_type The type of the hash table view * * @param[in,out] multimap_view The hash table to be built to insert rows into @@ -84,15 +85,15 @@ constexpr auto remap_sentinel_hash(H hash, S sentinel) * @param[in] row_bitmask Bitmask where bit `i` indicates the presence of a null * value in row `i` of input keys. This is nullptr if nulls are equal. */ -template +template __global__ void build_hash_table(multimap_view_type multimap_view, row_hash hash_build, const cudf::size_type build_table_num_rows, bitmask_type const* row_bitmask) { + using key_type = typename multimap_view_type::key_type; + using payload_type = typename multimap_view_type::mapped_type; + auto g = cg::tiled_partition(cg::this_thread_block()); auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto i = tid / cg_size; @@ -100,7 +101,7 @@ __global__ void build_hash_table(multimap_view_type multimap_view, while (i < build_table_num_rows) { if (!row_bitmask || cudf::bit_is_set(row_bitmask, i)) { // Compute the hash value of this row - auto const row_hash_value = + auto row_hash_value = remap_sentinel_hash(hash_build(i), multimap_view.get_empty_key_sentinel()); auto insert_pair = @@ -117,96 +118,65 @@ __global__ void build_hash_table(multimap_view_type multimap_view, * by probing the hash map with the probe table and counting the number of matches. * * @tparam JoinKind The type of join to be performed + * @tparam cg_size The size of the CUDA cooperative thread group * @tparam block_size The number of threads per block for this kernel * @tparam multimap_view_type The datatype of the hash table view * * @param[in] multi_map The view of the hash table built on the build table - * @param[in] build_table The build table - * @param[in] probe_table The probe table * @param[in] hash_probe Row hasher for the probe table - * @param[in] check_row_equality The row equality comparator + * @param[in] check_pair_equality The pair equality comparator * @param[in] probe_table_num_rows The number of rows in the probe table * @param[out] output_size The resulting output size */ -template -__global__ void compute_join_output_size(multimap_view_type multi_map, - table_device_view build_table, - table_device_view probe_table, +template +__global__ void compute_join_output_size(multimap_view_type multimap_view, row_hash hash_probe, - row_equality check_row_equality, + pair_equality check_pair_equality, const cudf::size_type probe_table_num_rows, - std::size_t* output_size) + atomic_counter_type* output_size) { // This kernel probes multiple elements in the probe_table and store the number of matches found // inside a register. A block reduction is used at the end to calculate the matches per thread // block, and atomically add to the global 'output_size'. Compared to probing one element per // thread, this implementation improves performance by reducing atomic adds to the shared memory // counter. + using key_type = typename multimap_view_type::key_type; + using payload_type = typename multimap_view_type::mapped_type; - cudf::size_type thread_counter{0}; - const cudf::size_type start_idx = threadIdx.x + blockIdx.x * blockDim.x; - const cudf::size_type stride = blockDim.x * gridDim.x; - /* - const auto unused_key = multi_map.get_unused_key(); - const auto end = multi_map.end(); + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * blockIdx.x + threadIdx.x; + auto probe_row_index = tid / cg_size; - for (cudf::size_type probe_row_index = start_idx; probe_row_index < probe_table_num_rows; - probe_row_index += stride) { - // Search the hash map for the hash value of the probe row using the row's - // hash value to determine the location where to search for the row in the hash map - auto const probe_row_hash_value = remap_sentinel_hash(hash_probe(probe_row_index), unused_key); + std::size_t thread_counter{0}; - auto found = multi_map.find(probe_row_hash_value, true, probe_row_hash_value); + const auto empty_key_sentinel = multimap_view.get_empty_key_sentinel(); - // for left-joins we always need to add an output - bool running = (JoinKind == join_kind::LEFT_JOIN) || (end != found); - bool found_match = false; - - while (running) { - // TODO Simplify this logic... + while (probe_row_index < probe_table_num_rows) { + // Search the hash map for the hash value of the probe row using the row's + // hash value to determine the location where to search for the row in the hash map + auto probe_row_hash_value = + remap_sentinel_hash(hash_probe(probe_row_index), empty_key_sentinel); - // Left joins always have an entry in the output - if (JoinKind == join_kind::LEFT_JOIN && (end == found)) { - running = false; - } - // Stop searching after encountering an empty hash table entry - else if (unused_key == found->first) { - running = false; - } - // First check that the hash values of the two rows match - else if (found->first == probe_row_hash_value) { - // If the hash values are equal, check that the rows are equal - if (check_row_equality(probe_row_index, found->second)) { - // If the rows are equal, then we have found a true match - found_match = true; - ++thread_counter; - } - // Continue searching for matching rows until you hit an empty hash map entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } else { - // Continue searching for matching rows until you hit an empty hash table entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } + auto current_pair = cuco::make_pair(std::move(probe_row_hash_value), + std::move(probe_row_index)); - if ((JoinKind == join_kind::LEFT_JOIN) && (!running) && (!found_match)) { ++thread_counter; } + if constexpr (JoinKind == join_kind::LEFT_JOIN) { + multimap_view.pair_count_outer(tile, current_pair, thread_counter, check_pair_equality); + } else { + multimap_view.pair_count(tile, current_pair, thread_counter, check_pair_equality); } + probe_row_index += (gridDim.x * block_size) / cg_size; } - using BlockReduce = cub::BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); - */ + if (threadIdx.x == 0) { output_size->fetch_add(block_counter, cuda::std::memory_order_relaxed); } } /** @@ -324,7 +294,7 @@ __device__ void flush_output_cache(const unsigned int activemask, * @param[in] build_table The build table * @param[in] probe_table The probe table * @param[in] hash_probe Row hasher for the probe table - * @param[in] check_row_equality The row equality comparator + * @param[in] check_pair_equality The row equality comparator * @param[out] join_output_l The left result of the join operation * @param[out] join_output_r The right result of the join operation * @param[in,out] current_idx A global counter used by threads to coordinate writes to the global @@ -339,7 +309,7 @@ __global__ void probe_hash_table(multimap_view_type multi_map, table_device_view build_table, table_device_view probe_table, row_hash hash_probe, - row_equality check_row_equality, + pair_equality check_pair_equality, size_type* join_output_l, size_type* join_output_r, cudf::size_type* current_idx, From 3f5a44487aba98bdf9a4e6b7c6b7cac8f39b19db Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Jul 2021 10:31:42 -0400 Subject: [PATCH 12/55] Minor cleanups in common utils --- cpp/src/join/join_common_utils.hpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 8bfbbc2edce..2de4544673f 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -37,12 +37,13 @@ constexpr size_type JoinNoneValue = std::numeric_limits::min(); using VectorPair = std::pair>, std::unique_ptr>>; -using multimap_type = - cuco::static_multimap, - cuda::thread_scope_device, - default_allocator>>; +using pair_type = cuco::pair_type; + +using multimap_type = cuco::static_multimap, + cuda::thread_scope_device, + default_allocator>; using row_hash = cudf::row_hasher; @@ -51,21 +52,19 @@ using row_equality = cudf::row_equality_comparator; class pair_equality { public: pair_equality(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true) - : check_row_equality{lhs, rhs, nulls_are_equal} + : _check_row_equality{lhs, rhs, nulls_are_equal} { } - __device__ __inline__ bool operator()( - const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const noexcept + __device__ __inline__ bool operator()(const pair_type& lhs, const pair_type& rhs) const noexcept { bool res = (lhs.first == rhs.first); - if (res) { return check_row_equality(rhs.second, lhs.second); } + if (res) { return _check_row_equality(rhs.second, lhs.second); } return res; } private: - cudf::row_equality_comparator check_row_equality; + cudf::row_equality_comparator _check_row_equality; }; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; From 9e7662eab9846e377c31f9a350daf4ced1f4cf7a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Jul 2021 10:32:56 -0400 Subject: [PATCH 13/55] Use cuco host bulk function instead of cudf multimap --- cpp/src/join/hash_join.cu | 83 ++++++++++++++++++-------------------- cpp/src/join/hash_join.cuh | 76 +++++++++++++++++----------------- 2 files changed, 78 insertions(+), 81 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index ad6d21240d4..65ce6bc42d1 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -74,6 +74,19 @@ struct valid_range { } }; +class build_predicate { + public: + build_predicate(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ __inline__ bool operator()(const pair_type& pair) const noexcept + { + return !_row_bitmask || cudf::bit_is_set(_row_bitmask, pair.second); + } + + private: + bitmask_type const* _row_bitmask; +}; + /** * @brief Creates a table containing the complement of left join indices. * This table has two columns. The first one is filled with JoinNoneValue(-1) @@ -175,25 +188,26 @@ std::unique_ptr> build_join_h size_type const build_table_num_rows{build_device_table->num_rows()}; std::size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); - auto hash_table = std::make_unique(hash_table_size, + auto hash_table = std::make_unique(hash_table_size, std::numeric_limits::max(), std::numeric_limits::max()); - auto hash_table_view = hash_table->get_device_mutable_view(); - - constexpr auto cg_size = multimap_type::cg_size(); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(build_table_num_rows * cg_size, block_size); auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} : cudf::detail::bitmask_and(build, stream); + build_predicate pred{static_cast(row_bitmask.data())}; + row_hash hash_build{*build_device_table}; + auto const empty_key_sentinel = hash_table->get_empty_key_sentinel(); + make_pair_function pair_func{hash_build, empty_key_sentinel}; - build_hash_table<<>>( - hash_table_view, - hash_build, - build_table_num_rows, - static_cast(row_bitmask.data())); + thrust::counting_iterator first(0); + thrust::transform_iterator, + cudf::detail::pair_type> + iter(first, pair_func); + + hash_table->insert_if(iter, iter + build_table_num_rows, pred); return hash_table; } @@ -239,44 +253,27 @@ probe_join_hash_table(cudf::table_device_view build_table, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); - auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); - auto const hash_table_view = hash_table.get_device_view(); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(probe_table.num_rows(), block_size); + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; row_hash hash_probe{probe_table}; - pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { - probe_hash_table - <<>>( - hash_table_view, - build_table, - probe_table, - hash_probe, - equality, - left_indices->data(), - right_indices->data(), - write_index.data(), - join_size); - auto const actual_size = write_index.value(stream); - left_indices->resize(actual_size, stream); - right_indices->resize(actual_size, stream); + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + thrust::counting_iterator first(0); + thrust::transform_iterator, + cudf::detail::pair_type> + iter(first, pair_func); + + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN or + JoinKind == cudf::detail::join_kind::LEFT_JOIN) { + hash_table.pair_retrieve_outer( + iter, iter + join_size, output.begin(), equality, stream.value()); } else { - probe_hash_table - <<>>( - hash_table_view, - build_table, - probe_table, - hash_probe, - equality, - left_indices->data(), - right_indices->data(), - write_index.data(), - join_size); + hash_table.pair_retrieve(iter, iter + join_size, output.begin(), equality, stream.value()); } return std::make_pair(std::move(left_indices), std::move(right_indices)); } diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index ffe97cc4907..375b7dc1ff5 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -38,6 +38,25 @@ namespace cudf { namespace detail { + +class make_pair_function { + public: + make_pair_function(row_hash const& hash, hash_value_type const empty_key_sentinel) + : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} + { + } + + __device__ __inline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); + return cuco::make_pair(std::move(row_hash_value), std::move(i)); + } + + private: + row_hash const& _hash; + hash_value_type const _empty_key_sentinel; +}; + /** * @brief Calculates the exact size of the join output produced when * joining two tables together. @@ -82,47 +101,28 @@ std::size_t compute_join_output_size(table_device_view build_table, } } - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar> d_size(0, stream); - - CHECK_CUDA(stream.value()); - - constexpr auto cg_size = multimap_type::cg_size(); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - int numBlocks{-1}; - - using multimap_view_type = typename multimap_type::device_view; - CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, - compute_join_output_size>, - block_size, - 0)); - - int dev_id{-1}; - CUDA_TRY(cudaGetDevice(&dev_id)); - - int num_sms{-1}; - CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - - row_hash hash_probe{probe_table}; pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - auto const hash_table_view = hash_table.get_device_view(); - - // Probe the hash table without actually building the output to simply - // find what the size of the output will be. - compute_join_output_size - <<>>( - hash_table_view, hash_probe, equality, probe_table_num_rows, d_size.data()); - - CHECK_CUDA(stream.value()); - auto h_size = d_size.value(stream); + row_hash hash_probe{probe_table}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + thrust::counting_iterator first(0); + thrust::transform_iterator, + cudf::detail::pair_type> + iter(first, pair_func); + + size_type size; + if constexpr (JoinKind == join_kind::LEFT_JOIN) { + size = static_cast( + hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value())); + } else { + size = static_cast( + hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value())); + } - return h_size.load(cuda::std::memory_order_relaxed); + return size; } /** From 7acb608820f0337ad77e31b732ca54546e2f4ba3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Jul 2021 18:05:09 -0400 Subject: [PATCH 14/55] Refactor probe_join_hash_table and get_full_join_size to use cuco multimap --- cpp/src/join/hash_join.cu | 47 ++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 213acead74b..51a00d4b8f6 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include #include @@ -157,12 +158,20 @@ probe_join_hash_table(cudf::table_device_view build_table, cudf::detail::pair_type> iter(first, pair_func); + const cudf::size_type probe_table_num_rows = probe_table.num_rows(); + + auto out1_zip = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin())); + auto out2_zip = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin())); + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN or JoinKind == cudf::detail::join_kind::LEFT_JOIN) { hash_table.pair_retrieve_outer( - iter, iter + join_size, output.begin(), equality, stream.value()); + iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); } else { - hash_table.pair_retrieve(iter, iter + join_size, output.begin(), equality, stream.value()); + hash_table.pair_retrieve( + iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); } return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -199,23 +208,29 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); - auto const hash_table_view = hash_table.get_device_view(); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(probe_table.num_rows(), block_size); + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; row_hash hash_probe{probe_table}; - pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - probe_hash_table - <<>>(hash_table_view, - build_table, - probe_table, - hash_probe, - equality, - left_indices->data(), - right_indices->data(), - write_index.data(), - join_size); + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + thrust::counting_iterator first(0); + thrust::transform_iterator, + cudf::detail::pair_type> + iter(first, pair_func); + + const cudf::size_type probe_table_num_rows = probe_table.num_rows(); + + auto out1_zip = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin())); + auto out2_zip = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin())); + + hash_table.pair_retrieve_outer( + iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); + // Release intermediate memory allocation left_indices->resize(0, stream); From 16ff127f53a0cdd1c27b88c78f2f2b6104054515 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 29 Jul 2021 18:11:19 -0400 Subject: [PATCH 15/55] Cleanups: get rid of join kernels --- cpp/src/join/hash_join.cuh | 14 +++++++++++++- cpp/src/join/join_kernels.cuh | 13 ------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 06c6b7a1607..a8452461c8c 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -40,6 +39,19 @@ namespace cudf { namespace detail { +/** + * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. + * + * @param hash The hash value to potentially remap + * @param sentinel The reserved value + */ +template +constexpr auto remap_sentinel_hash(H hash, S sentinel) +{ + // Arbitrarily choose hash - 1 + return (hash == sentinel) ? (hash - 1) : hash; +} + class make_pair_function { public: make_pair_function(row_hash const& hash, hash_value_type const empty_key_sentinel) diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index 2e35c9a6236..62fe2613304 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -28,19 +28,6 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; -/** - * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. - * - * @param hash The hash value to potentially remap - * @param sentinel The reserved value - */ -template -constexpr auto remap_sentinel_hash(H hash, S sentinel) -{ - // Arbitrarily choose hash - 1 - return (hash == sentinel) ? (hash - 1) : hash; -} - /** * @brief Builds a hash table from a row hasher that maps the hash * values of each row to its respective row index. From 7af07f8b6a39adfff71ebb2ecfea934c8f854e3e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 30 Jul 2021 15:53:58 -0400 Subject: [PATCH 16/55] Remove join kernels --- cpp/src/join/join_kernels.cuh | 294 ---------------------------------- 1 file changed, 294 deletions(-) delete mode 100644 cpp/src/join/join_kernels.cuh diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh deleted file mode 100644 index 62fe2613304..00000000000 --- a/cpp/src/join/join_kernels.cuh +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include -#include - -namespace cudf { -namespace detail { -namespace cg = cooperative_groups; - -/** - * @brief Builds a hash table from a row hasher that maps the hash - * values of each row to its respective row index. - * - * @tparam cg_size The size of the CUDA cooperative thread group - * @tparam multimap_view_type The type of the hash table view - * - * @param[in,out] multimap_view The hash table to be built to insert rows into - * @param[in] hash_build Row hasher for the build table - * @param[in] build_table_num_rows The number of rows in the build table - * @param[in] row_bitmask Bitmask where bit `i` indicates the presence of a null - * value in row `i` of input keys. This is nullptr if nulls are equal. - */ -template -__global__ void build_hash_table(multimap_view_type multimap_view, - row_hash hash_build, - const cudf::size_type build_table_num_rows, - bitmask_type const* row_bitmask) -{ - using key_type = typename multimap_view_type::key_type; - using payload_type = typename multimap_view_type::mapped_type; - - auto g = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto i = tid / cg_size; - - while (i < build_table_num_rows) { - if (!row_bitmask || cudf::bit_is_set(row_bitmask, i)) { - // Compute the hash value of this row - auto row_hash_value = - remap_sentinel_hash(hash_build(i), multimap_view.get_empty_key_sentinel()); - - auto insert_pair = - cuco::make_pair(std::move(row_hash_value), std::move(i)); - - multimap_view.insert(g, insert_pair); - } - i += (blockDim.x * gridDim.x) / cg_size; - } -} - -/** - * @brief Computes the output size of joining the probe table to the build table - * by probing the hash map with the probe table and counting the number of matches. - * - * @tparam JoinKind The type of join to be performed - * @tparam cg_size The size of the CUDA cooperative thread group - * @tparam block_size The number of threads per block for this kernel - * @tparam multimap_view_type The datatype of the hash table view - * - * @param[in] multi_map The view of the hash table built on the build table - * @param[in] hash_probe Row hasher for the probe table - * @param[in] check_pair_equality The pair equality comparator - * @param[in] probe_table_num_rows The number of rows in the probe table - * @param[out] output_size The resulting output size - */ -template -__global__ void compute_join_output_size(multimap_view_type multimap_view, - row_hash hash_probe, - pair_equality check_pair_equality, - const cudf::size_type probe_table_num_rows, - atomic_counter_type* output_size) -{ - // This kernel probes multiple elements in the probe_table and store the number of matches found - // inside a register. A block reduction is used at the end to calculate the matches per thread - // block, and atomically add to the global 'output_size'. Compared to probing one element per - // thread, this implementation improves performance by reducing atomic adds to the shared memory - // counter. - using key_type = typename multimap_view_type::key_type; - using payload_type = typename multimap_view_type::mapped_type; - - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto probe_row_index = tid / cg_size; - - std::size_t thread_counter{0}; - - const auto empty_key_sentinel = multimap_view.get_empty_key_sentinel(); - - while (probe_row_index < probe_table_num_rows) { - // Search the hash map for the hash value of the probe row using the row's - // hash value to determine the location where to search for the row in the hash map - auto probe_row_hash_value = - remap_sentinel_hash(hash_probe(probe_row_index), empty_key_sentinel); - - auto current_pair = cuco::make_pair(std::move(probe_row_hash_value), - std::move(probe_row_index)); - - if constexpr (JoinKind == join_kind::LEFT_JOIN) { - multimap_view.pair_count_outer(tile, current_pair, thread_counter, check_pair_equality); - } else { - multimap_view.pair_count(tile, current_pair, thread_counter, check_pair_equality); - } - probe_row_index += (gridDim.x * block_size) / cg_size; - } - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - // Add block counter to global counter - if (threadIdx.x == 0) { output_size->fetch_add(block_counter, cuda::std::memory_order_relaxed); } -} - -/** - * @brief Probes the hash map with the probe table to find all matching rows - * between the probe and hash table and generate the output for the desired - * Join operation. - * - * @tparam JoinKind The type of join to be performed - * @tparam block_size The number of threads per block for this kernel - * @tparam output_cache_size The side of the shared memory buffer to cache join output results - * @tparam multimap_type The type of the hash table view - * - * @param[in] multi_map The view of the hash table built from the build table - * @param[in] build_table The build table - * @param[in] probe_table The probe table - * @param[in] hash_probe Row hasher for the probe table - * @param[in] check_pair_equality The row equality comparator - * @param[out] join_output_l The left result of the join operation - * @param[out] join_output_r The right result of the join operation - * @param[in,out] current_idx A global counter used by threads to coordinate writes to the global - output - * @param[in] max_size The maximum size of the output - */ -template -__global__ void probe_hash_table(multimap_view_type multi_map, - table_device_view build_table, - table_device_view probe_table, - row_hash hash_probe, - pair_equality check_pair_equality, - size_type* join_output_l, - size_type* join_output_r, - cudf::size_type* current_idx, - const std::size_t max_size) -{ - constexpr int num_warps = block_size / detail::warp_size; - __shared__ size_type current_idx_shared[num_warps]; - __shared__ size_type join_shared_l[num_warps][output_cache_size]; - __shared__ size_type join_shared_r[num_warps][output_cache_size]; - - const int warp_id = threadIdx.x / detail::warp_size; - const int lane_id = threadIdx.x % detail::warp_size; - const cudf::size_type probe_table_num_rows = probe_table.num_rows(); - - if (0 == lane_id) { current_idx_shared[warp_id] = 0; } - - __syncwarp(); - - size_type probe_row_index = threadIdx.x + blockIdx.x * blockDim.x; - - /* - - const unsigned int activemask = __ballot_sync(0xffffffff, probe_row_index < probe_table_num_rows); - if (probe_row_index < probe_table_num_rows) { - const auto unused_key = multi_map.get_unused_key(); - const auto end = multi_map.end(); - - // Search the hash map for the hash value of the probe row using the row's - // hash value to determine the location where to search for the row in the hash map - auto const probe_row_hash_value = remap_sentinel_hash(hash_probe(probe_row_index), unused_key); - - auto found = multi_map.find(probe_row_hash_value, true, probe_row_hash_value); - - bool running = (JoinKind == join_kind::LEFT_JOIN) || - (end != found); // for left-joins we always need to add an output - bool found_match = false; - while (__any_sync(activemask, running)) { - if (running) { - // TODO Simplify this logic... - - // Left joins always have an entry in the output - if ((JoinKind == join_kind::LEFT_JOIN) && (end == found)) { - running = false; - } - // Stop searching after encountering an empty hash table entry - else if (unused_key == found->first) { - running = false; - } - // First check that the hash values of the two rows match - else if (found->first == probe_row_hash_value) { - // If the hash values are equal, check that the rows are equal - // TODO : REMOVE : if(row_equal{probe_table, build_table}(probe_row_index, found->second)) - if (check_row_equality(probe_row_index, found->second)) { - // If the rows are equal, then we have found a true match - found_match = true; - add_pair_to_cache(probe_row_index, - found->second, - current_idx_shared, - warp_id, - join_shared_l[warp_id], - join_shared_r[warp_id]); - } - // Continue searching for matching rows until you hit an empty hash map entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } else { - // Continue searching for matching rows until you hit an empty hash table entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } - - // If performing a LEFT join and no match was found, insert a Null into the output - if ((JoinKind == join_kind::LEFT_JOIN) && (!running) && (!found_match)) { - add_pair_to_cache(probe_row_index, - static_cast(JoinNoneValue), - current_idx_shared, - warp_id, - join_shared_l[warp_id], - join_shared_r[warp_id]); - } - } - - __syncwarp(activemask); - // flush output cache if next iteration does not fit - if (current_idx_shared[warp_id] + detail::warp_size >= output_cache_size) { - flush_output_cache(activemask, - max_size, - warp_id, - lane_id, - current_idx, - current_idx_shared, - join_shared_l, - join_shared_r, - join_output_l, - join_output_r); - __syncwarp(activemask); - if (0 == lane_id) { current_idx_shared[warp_id] = 0; } - __syncwarp(activemask); - } - } - - // final flush of output cache - if (current_idx_shared[warp_id] > 0) { - flush_output_cache(activemask, - max_size, - warp_id, - lane_id, - current_idx, - current_idx_shared, - join_shared_l, - join_shared_r, - join_output_l, - join_output_r); - } - } -*/ -} - -} // namespace detail - -} // namespace cudf From e16aa8b2b23ad49ea8a9c2d1b73eb4776162d865 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 3 Aug 2021 10:58:06 -0400 Subject: [PATCH 17/55] Use JoinNoneValue as empty value sentinel --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 51a00d4b8f6..1733772236b 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -80,7 +80,7 @@ std::unique_ptr> build_join_h auto hash_table = std::make_unique(hash_table_size, std::numeric_limits::max(), - std::numeric_limits::max()); + JoinNoneValue); auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} From 2952b027722bf7329083a14ea17d4001f1d39546 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 3 Aug 2021 11:48:31 -0400 Subject: [PATCH 18/55] Code formatting --- cpp/src/join/hash_join.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 1733772236b..a65fd6615d9 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -78,9 +78,8 @@ std::unique_ptr> build_join_h size_type const build_table_num_rows{build_device_table->num_rows()}; std::size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); - auto hash_table = std::make_unique(hash_table_size, - std::numeric_limits::max(), - JoinNoneValue); + auto hash_table = std::make_unique( + hash_table_size, std::numeric_limits::max(), JoinNoneValue); auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} From bc0530e3a21e3b4062c9fa00c1ca99d73cbd75ec Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 3 Aug 2021 12:34:59 -0400 Subject: [PATCH 19/55] More descriptive naming: row_contains_null instead of build_predicate --- cpp/src/join/hash_join.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index a65fd6615d9..92da1d5aae4 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -35,9 +35,9 @@ namespace detail { namespace { -class build_predicate { +class row_contains_null { public: - build_predicate(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + row_contains_null(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} __device__ __inline__ bool operator()(const pair_type& pair) const noexcept { @@ -84,7 +84,7 @@ std::unique_ptr> build_join_h auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} : cudf::detail::bitmask_and(build, stream); - build_predicate pred{static_cast(row_bitmask.data())}; + row_contains_null pred{static_cast(row_bitmask.data())}; row_hash hash_build{*build_device_table}; auto const empty_key_sentinel = hash_table->get_empty_key_sentinel(); From 31e24a6d79e869ff6c32cccb4f419038efc98085 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 3 Aug 2021 15:56:02 -0400 Subject: [PATCH 20/55] Get rid of multimap unique_ptr --- cpp/src/join/hash_join.cu | 53 +++++++++++++++++--------------------- cpp/src/join/hash_join.cuh | 4 +-- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 92da1d5aae4..232d748a408 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -62,43 +62,35 @@ std::pair, std::unique_ptr> get_empty_joined_table * @brief Builds the hash table based on the given `build_table`. * * @param build Table of columns used to build join hash. + * @param hash_table Build hash table. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. * - * @return Built hash table. */ -std::unique_ptr> build_join_hash_table( - cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) +void build_join_hash_table(cudf::table_view const& build, + multimap_type& hash_table, + null_equality compare_nulls, + rmm::cuda_stream_view stream) { auto build_device_table = cudf::table_device_view::create(build, stream); CUDF_EXPECTS(0 != build_device_table->num_columns(), "Selected build dataset is empty"); CUDF_EXPECTS(0 != build_device_table->num_rows(), "Build side table has no rows"); - size_type const build_table_num_rows{build_device_table->num_rows()}; - std::size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); - - auto hash_table = std::make_unique( - hash_table_size, std::numeric_limits::max(), JoinNoneValue); - auto const row_bitmask = (compare_nulls == null_equality::EQUAL) ? rmm::device_buffer{0, stream} : cudf::detail::bitmask_and(build, stream); row_contains_null pred{static_cast(row_bitmask.data())}; row_hash hash_build{*build_device_table}; - auto const empty_key_sentinel = hash_table->get_empty_key_sentinel(); + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_build, empty_key_sentinel}; thrust::counting_iterator first(0); - thrust::transform_iterator, - cudf::detail::pair_type> - iter(first, pair_func); + auto iter = thrust::make_transform_iterator(first, pair_func); - hash_table->insert_if(iter, iter + build_table_num_rows, pred); - - return hash_table; + size_type const build_table_num_rows{build_device_table->num_rows()}; + hash_table.insert_if(iter, iter + build_table_num_rows, pred); } /** @@ -289,7 +281,10 @@ hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) - : _hash_table(nullptr) + : _empty{!build.num_rows()}, + _hash_table{compute_hash_table_size(build.num_rows()), + std::numeric_limits::max(), + cudf::detail::JoinNoneValue} { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); @@ -302,9 +297,9 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, // need to store off the owning structures for some of the views in _build _created_null_columns = std::move(std::get<3>(flattened_build)); - if (0 == build.num_rows()) { return; } + if (_empty) { return; } - _hash_table = build_join_hash_table(_build, compare_nulls, stream); + build_join_hash_table(_build, _hash_table, compare_nulls, stream); } std::pair>, @@ -351,13 +346,13 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); + CUDF_EXPECTS(!_empty, "Hash table of hash join is null."); auto build_table = cudf::table_device_view::create(_build, stream); auto probe_table = cudf::table_device_view::create(probe, stream); return cudf::detail::compute_join_output_size( - *build_table, *probe_table, *_hash_table, compare_nulls, stream); + *build_table, *probe_table, _hash_table, compare_nulls, stream); } std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, @@ -367,13 +362,13 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr CUDF_FUNC_RANGE(); // Trivial left join case - exit early - if (!_hash_table) { return probe.num_rows(); } + if (_empty) { return probe.num_rows(); } auto build_table = cudf::table_device_view::create(_build, stream); auto probe_table = cudf::table_device_view::create(probe, stream); return cudf::detail::compute_join_output_size( - *build_table, *probe_table, *_hash_table, compare_nulls, stream); + *build_table, *probe_table, _hash_table, compare_nulls, stream); } std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, @@ -384,12 +379,12 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr CUDF_FUNC_RANGE(); // Trivial left join case - exit early - if (!_hash_table) { return probe.num_rows(); } + if (_empty) { return probe.num_rows(); } auto build_table = cudf::table_device_view::create(_build, stream); auto probe_table = cudf::table_device_view::create(probe, stream); - return get_full_join_size(*build_table, *probe_table, *_hash_table, compare_nulls, stream, mr); + return get_full_join_size(*build_table, *probe_table, _hash_table, compare_nulls, stream, mr); } template @@ -438,17 +433,17 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early - if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) { + if (_empty && JoinKind != cudf::detail::join_kind::INNER_JOIN) { return get_trivial_left_join_indices(probe, stream, mr); } - CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); + CUDF_EXPECTS(!_empty, "Hash table of hash join is null."); auto build_table = cudf::table_device_view::create(_build, stream); auto probe_table = cudf::table_device_view::create(probe, stream); auto join_indices = cudf::detail::probe_join_hash_table( - *build_table, *probe_table, *_hash_table, compare_nulls, output_size, stream, mr); + *build_table, *probe_table, _hash_table, compare_nulls, output_size, stream, mr); if (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index a8452461c8c..9fc625e2c23 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -158,8 +158,8 @@ struct hash_join::hash_join_impl { private: cudf::table_view _build; std::vector> _created_null_columns; - std::unique_ptr> - _hash_table; + bool _empty; + cudf::detail::multimap_type _hash_table; public: /** From 29298a9514e4f4dd79c7e106ea96a0a61d4d9437 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 3 Aug 2021 18:13:55 -0400 Subject: [PATCH 21/55] Use char as default alloactor data type --- cpp/src/join/join_common_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index f2e669432bc..2e12e657083 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -40,7 +40,7 @@ using multimap_type = cuco::static_multimap, cuda::thread_scope_device, - default_allocator>; + default_allocator>; using row_hash = cudf::row_hasher; From 7025ae663603e8bf13edc23b42fb6e2d8b5d2f92 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 4 Aug 2021 14:53:00 -0400 Subject: [PATCH 22/55] Get rid of const reference --- cpp/src/join/hash_join.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 9fc625e2c23..44c018a465c 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -66,7 +66,7 @@ class make_pair_function { } private: - row_hash const& _hash; + row_hash _hash; hash_value_type const _empty_key_sentinel; }; From a98f156de9d5b491376469a904472bab080e211f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 5 Aug 2021 10:47:09 -0400 Subject: [PATCH 23/55] Pass stream to insert_if function --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 232d748a408..bf775e48651 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -90,7 +90,7 @@ void build_join_hash_table(cudf::table_view const& build, auto iter = thrust::make_transform_iterator(first, pair_func); size_type const build_table_num_rows{build_device_table->num_rows()}; - hash_table.insert_if(iter, iter + build_table_num_rows, pred); + hash_table.insert_if(iter, iter + build_table_num_rows, pred, stream.value()); } /** From a485cb71dcd790f6ce925dba9d2ff9a1786288c9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 5 Aug 2021 17:42:26 -0400 Subject: [PATCH 24/55] Use make_transform_iterator instead of naive declarations --- cpp/src/join/hash_join.cu | 10 ++-------- cpp/src/join/hash_join.cuh | 5 +---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index bf775e48651..b70e273774a 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -144,10 +144,7 @@ probe_join_hash_table(cudf::table_device_view build_table, make_pair_function pair_func{hash_probe, empty_key_sentinel}; thrust::counting_iterator first(0); - thrust::transform_iterator, - cudf::detail::pair_type> - iter(first, pair_func); + auto iter = thrust::make_transform_iterator(first, pair_func); const cudf::size_type probe_table_num_rows = probe_table.num_rows(); @@ -207,10 +204,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, make_pair_function pair_func{hash_probe, empty_key_sentinel}; thrust::counting_iterator first(0); - thrust::transform_iterator, - cudf::detail::pair_type> - iter(first, pair_func); + auto iter = thrust::make_transform_iterator(first, pair_func); const cudf::size_type probe_table_num_rows = probe_table.num_rows(); diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 44c018a465c..e049db08b9c 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -121,10 +121,7 @@ std::size_t compute_join_output_size(table_device_view build_table, make_pair_function pair_func{hash_probe, empty_key_sentinel}; thrust::counting_iterator first(0); - thrust::transform_iterator, - cudf::detail::pair_type> - iter(first, pair_func); + auto iter = thrust::make_transform_iterator(first, pair_func); size_type size; if constexpr (JoinKind == join_kind::LEFT_JOIN) { From 115a5e31a83ef7484eaedafb91f9c26bc7079c81 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Aug 2021 13:22:34 -0400 Subject: [PATCH 25/55] Move pair equality functor to cuh header --- cpp/src/join/join_common_utils.cuh | 22 ++++++++++++++++++++++ cpp/src/join/join_common_utils.hpp | 18 ------------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 2b1c870bea1..14fb5a71c21 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -29,6 +29,28 @@ namespace cudf { namespace detail { +/** + * @brief Device functor to determine if two pairs are identical. + */ +class pair_equality { + public: + pair_equality(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true) + : _check_row_equality{lhs, rhs, nulls_are_equal} + { + } + + __device__ __forceinline__ bool operator()(const pair_type& lhs, + const pair_type& rhs) const noexcept + { + bool res = (lhs.first == rhs.first); + if (res) { return _check_row_equality(rhs.second, lhs.second); } + return res; + } + + private: + row_equality _check_row_equality; +}; + /** * @brief Computes the trivial left join operation for the case when the * right table is empty. In this case all the valid indices of the left table diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 2e12e657083..50a2edf2ff8 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -46,24 +46,6 @@ using row_hash = cudf::row_hasher; using row_equality = cudf::row_equality_comparator; -class pair_equality { - public: - pair_equality(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true) - : _check_row_equality{lhs, rhs, nulls_are_equal} - { - } - - __device__ __inline__ bool operator()(const pair_type& lhs, const pair_type& rhs) const noexcept - { - bool res = (lhs.first == rhs.first); - if (res) { return _check_row_equality(rhs.second, lhs.second); } - return res; - } - - private: - cudf::row_equality_comparator _check_row_equality; -}; - enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) From fe1757ad1fef23f747f429cf64e3c7bb9b909967 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Aug 2021 18:15:12 -0400 Subject: [PATCH 26/55] Fix a wrong logic in full_join --- cpp/src/join/hash_join.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b70e273774a..b3745afd742 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -155,8 +155,13 @@ probe_join_hash_table(cudf::table_device_view build_table, if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN or JoinKind == cudf::detail::join_kind::LEFT_JOIN) { - hash_table.pair_retrieve_outer( + [[maybe_unused]] auto const actual_size = hash_table.pair_retrieve_outer( iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); + + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + left_indices->resize(actual_size, stream); + right_indices->resize(actual_size, stream); + } } else { hash_table.pair_retrieve( iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); From cb591c88ecce9189d7cd0aa816ae4054fd787536 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Aug 2021 18:39:48 -0400 Subject: [PATCH 27/55] Minor improvement: use constexpr --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b3745afd742..a5e4b68b03a 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -444,7 +444,7 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, auto join_indices = cudf::detail::probe_join_hash_table( *build_table, *probe_table, _hash_table, compare_nulls, output_size, stream, mr); - if (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); From 5aacf196f69c3f79b24d725a3e5f8a5ef786aac1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 6 Aug 2021 19:14:34 -0400 Subject: [PATCH 28/55] Fix a bug in JoinDictionaryTest.InnerJoinNoNulls: sort before compare --- cpp/tests/join/join_tests.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index e468368842a..c84b6e68baf 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1426,11 +1426,16 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls) result_view.column(3), decoded4->view(), result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, InnerJoinWithNulls) From f379426eef94b727e7df797087fbbb6308ed92ea Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 9 Aug 2021 10:32:44 -0400 Subject: [PATCH 29/55] Sort before compare for all JoinDictionaryTest tests --- cpp/tests/join/join_tests.cpp | 86 ++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index c84b6e68baf..87b6da7d05f 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1352,21 +1352,25 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - { - auto result = cudf::left_join(t0, t1, {0}, {0}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - auto decoded4 = cudf::dictionary::decode(result_view.column(4)); - std::vector result_decoded({result_view.column(0), - decoded1->view(), - result_view.column(2), - result_view.column(3), - decoded4->view(), - result_view.column(5)}); - - auto gold = cudf::left_join(g0, g1, {0}, {0}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); - } + + auto result = cudf::left_join(t0, t1, {0}, {0}); + auto result_view = result->view(); + auto decoded1 = cudf::dictionary::decode(result_view.column(1)); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); + + auto gold = cudf::left_join(g0, g1, {0}, {0}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, LeftJoinWithNulls) @@ -1394,11 +1398,16 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls) result_view.column(3), result_view.column(4), decoded5->view()}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); + + auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); + auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); + auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); - auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, InnerJoinNoNulls) @@ -1463,11 +1472,16 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls) result_view.column(3), result_view.column(4), decoded5->view()}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); + + auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); + auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); - auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, FullJoinNoNulls) @@ -1495,11 +1509,16 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls) result_view.column(3), decoded4->view(), result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, FullJoinWithNulls) @@ -1527,11 +1546,16 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) decoded3->view(), result_view.column(4), result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); - auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); + auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinWithStructsAndNulls) From d0d481dc595da5c4a91d9547e5d196e63053775b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 9 Aug 2021 19:31:40 -0400 Subject: [PATCH 30/55] Multimap takes stream as argument --- cpp/src/join/hash_join.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index a5e4b68b03a..c5161bf7f2c 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -283,7 +283,8 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, : _empty{!build.num_rows()}, _hash_table{compute_hash_table_size(build.num_rows()), std::numeric_limits::max(), - cudf::detail::JoinNoneValue} + cudf::detail::JoinNoneValue, + stream} { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); From d49988a48f0cf2986379b56b2d9d9658c3c0ea07 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 Aug 2021 11:23:59 -0400 Subject: [PATCH 31/55] Minor optimization of pair_equality --- cpp/src/join/join_common_utils.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 380e4a83415..cec633765c7 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -40,9 +40,7 @@ class pair_equality { __device__ __forceinline__ bool operator()(const pair_type& lhs, const pair_type& rhs) const noexcept { - bool res = (lhs.first == rhs.first); - if (res) { return _check_row_equality(rhs.second, lhs.second); } - return res; + return lhs.first == rhs.first and _check_row_equality(rhs.second, lhs.second); } private: From ef18f186bc8474ced7d013f58e4ee0e4d2a9eafb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 22 Aug 2021 19:15:10 -0400 Subject: [PATCH 32/55] Refactor generate_input_tables: enable key multiplicity control --- cpp/benchmarks/join/generate_input_tables.cuh | 117 ++++-------------- cpp/benchmarks/join/join_benchmark_common.hpp | 20 ++- 2 files changed, 31 insertions(+), 106 deletions(-) diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index d7f64716e58..e846317f472 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -41,17 +41,12 @@ __global__ static void init_curand(curandState* state, const int nstates) template __global__ static void init_build_tbl(key_type* const build_tbl, const size_type build_tbl_size, - const key_type rand_max, - const bool uniq_build_tbl_keys, - key_type* const lottery, - const size_type lottery_size, + const int multiplicity, curandState* state, const int num_states) { - static_assert(std::is_signed::value, "key_type needs to be signed for lottery to work"); - - const int start_idx = blockIdx.x * blockDim.x + threadIdx.x; - const key_type stride = blockDim.x * gridDim.x; + auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const stride = blockDim.x * gridDim.x; assert(start_idx < num_states); curandState localState = state[start_idx]; @@ -59,28 +54,7 @@ __global__ static void init_build_tbl(key_type* const build_tbl, for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) { const double x = curand_uniform_double(&localState); - if (uniq_build_tbl_keys) { - // If the build table keys need to be unique, go through lottery array from lottery_idx until - // finding a key which has not been used (-1). Mark the key as been used by atomically setting - // the spot to -1. - - size_type lottery_idx = x * lottery_size; - key_type lottery_val = -1; - - while (-1 == lottery_val) { - lottery_val = lottery[lottery_idx]; - - if (-1 != lottery_val) { - lottery_val = atomicCAS(lottery + lottery_idx, lottery_val, -1); - } - - lottery_idx = (lottery_idx + 1) % lottery_size; - } - - build_tbl[idx] = lottery_val; - } else { - build_tbl[idx] = x * rand_max; - } + build_tbl[idx] = static_cast(x * (build_tbl_size / multiplicity)); } state[start_idx] = localState; @@ -89,16 +63,15 @@ __global__ static void init_build_tbl(key_type* const build_tbl, template __global__ void init_probe_tbl(key_type* const probe_tbl, const size_type probe_tbl_size, - const key_type* const build_tbl, const size_type build_tbl_size, - const key_type* const lottery, - const size_type lottery_size, + const key_type rand_max, const double selectivity, + const int multiplicity, curandState* state, const int num_states) { - const int start_idx = blockIdx.x * blockDim.x + threadIdx.x; - const size_type stride = blockDim.x * gridDim.x; + auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const stride = blockDim.x * gridDim.x; assert(start_idx < num_states); curandState localState = state[start_idx]; @@ -109,21 +82,15 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, if (x <= selectivity) { // x <= selectivity means this key in the probe table should be present in the build table, so - // we pick a key from build_tbl - x = curand_uniform_double(&localState); - size_type build_tbl_idx = x * build_tbl_size; - - if (build_tbl_idx >= build_tbl_size) { build_tbl_idx = build_tbl_size - 1; } - - val = build_tbl[build_tbl_idx]; + // we pick a key from [0, build_tbl_size / multiplicity] + x = curand_uniform_double(&localState); + val = static_cast(x * (build_tbl_size / multiplicity)); } else { // This key in the probe table should not be present in the build table, so we pick a key from - // lottery. - x = curand_uniform_double(&localState); - size_type lottery_idx = x * lottery_size; - val = lottery[lottery_idx]; + // [build_tbl_size, rand_max]. + x = curand_uniform_double(&localState); + val = static_cast(x * (rand_max - build_tbl_size) + build_tbl_size); } - probe_tbl[idx] = val; } @@ -152,9 +119,7 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, * @param[in] build_tbl_size number of keys in the build table * @param[in] selectivity probability with which an element of the probe table is * present in the build table. - * @param[in] rand_max maximum random number to generate. I.e. random numbers are - * integers from [0,rand_max]. - * @param[in] uniq_build_tbl_keys if each key in the build table should appear exactly once. + * @param[in] multiplicity number of matches for each key. */ template void generate_input_tables(key_type* const build_tbl, @@ -162,8 +127,7 @@ void generate_input_tables(key_type* const build_tbl, key_type* const probe_tbl, const size_type probe_tbl_size, const double selectivity, - const key_type rand_max, - const bool uniq_build_tbl_keys) + const int multiplicity) { // With large values of rand_max the a lot of temporary storage is needed for the lottery. At the // expense of not being that accurate with applying the selectivity an especially more memory @@ -171,9 +135,7 @@ void generate_input_tables(key_type* const build_tbl, // let one table choose random numbers from only one interval and the other only select with // selective probability from the same interval and from the other in the other cases. - static_assert(std::is_signed::value, "key_type needs to be signed for lottery to work"); - - const int block_size = 128; + constexpr int block_size = 128; // Maximize exposed parallelism while minimizing storage for curand state int num_blocks_init_build_tbl{-1}; @@ -198,55 +160,20 @@ void generate_input_tables(key_type* const build_tbl, CHECK_CUDA(0); - size_type lottery_size = - rand_max < std::numeric_limits::max() - 1 ? rand_max + 1 : rand_max; - rmm::device_uvector lottery(lottery_size, rmm::cuda_stream_default); - - if (uniq_build_tbl_keys) { - thrust::sequence(rmm::exec_policy(), lottery.begin(), lottery.end(), 0); - } - - init_build_tbl - <<>>(build_tbl, - build_tbl_size, - rand_max, - uniq_build_tbl_keys, - lottery.data(), - lottery_size, - devStates.data(), - num_states); + init_build_tbl<<>>( + build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states); CHECK_CUDA(0); - rmm::device_uvector build_tbl_sorted(build_tbl_size, rmm::cuda_stream_default); - - CUDA_TRY(cudaMemcpy(build_tbl_sorted.data(), - build_tbl, - build_tbl_size * sizeof(key_type), - cudaMemcpyDeviceToDevice)); - - thrust::sort(rmm::exec_policy(), build_tbl_sorted.begin(), build_tbl_sorted.end()); - - // Exclude keys used in build table from lottery - thrust::counting_iterator first_lottery_elem(0); - thrust::counting_iterator last_lottery_elem = first_lottery_elem + lottery_size; - key_type* lottery_end = thrust::set_difference(rmm::exec_policy(), - first_lottery_elem, - last_lottery_elem, - build_tbl_sorted.begin(), - build_tbl_sorted.end(), - lottery.data()); - - lottery_size = thrust::distance(lottery.data(), lottery_end); + auto const rand_max = std::numeric_limits::max(); init_probe_tbl <<>>(probe_tbl, probe_tbl_size, - build_tbl, build_tbl_size, - lottery.data(), - lottery_size, + rand_max, selectivity, + multiplicity, devStates.data(), num_states); diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_benchmark_common.hpp index e6fed454707..540366765c4 100644 --- a/cpp/benchmarks/join/join_benchmark_common.hpp +++ b/cpp/benchmarks/join/join_benchmark_common.hpp @@ -59,9 +59,8 @@ static void BM_join(state_type& state, Join JoinFunc) } }(); - const cudf::size_type rand_max_val{build_table_size * 2}; - const double selectivity = 0.3; - const bool is_build_table_key_unique = true; + const double selectivity = 0.3; + const int multiplicity = 1; // Generate build and probe tables cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); @@ -94,8 +93,7 @@ static void BM_join(state_type& state, Join JoinFunc) probe_key_column->mutable_view().data(), probe_table_size, selectivity, - rand_max_val, - is_build_table_key_unique); + multiplicity); auto payload_data_it = thrust::make_counting_iterator(0); cudf::test::fixed_width_column_wrapper build_payload_column( @@ -124,12 +122,12 @@ static void BM_join(state_type& state, Join JoinFunc) if constexpr (std::is_same_v and (not is_conditional)) { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - JoinFunc(probe_table, - build_table, - columns_to_join, - columns_to_join, - cudf::null_equality::UNEQUAL, - stream_view); + auto result = JoinFunc(probe_table, + build_table, + columns_to_join, + columns_to_join, + cudf::null_equality::UNEQUAL, + stream_view); }); } From 8d8741ff21e60c92db7861ee996b78443c0c978a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 22 Aug 2021 19:17:46 -0400 Subject: [PATCH 33/55] Fix a minor bug: stream instead of stream view --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index c5161bf7f2c..3b2ee4c3e71 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -284,7 +284,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, _hash_table{compute_hash_table_size(build.num_rows()), std::numeric_limits::max(), cudf::detail::JoinNoneValue, - stream} + stream.value()} { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); From 9d9cd55fe820f83493e2e2459e7724ee41b7d73e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Aug 2021 16:52:20 -0400 Subject: [PATCH 34/55] Fetch cuco static multimap branch --- cpp/CMakeLists.txt | 3 --- cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a73895eb424..d6b457a94d4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -154,8 +154,6 @@ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) include(cmake/thirdparty/CUDF_GetcuCollections.cmake) # find or install GoogleTest include(cmake/thirdparty/CUDF_GetGTest.cmake) -# find cuCollections -include(cmake/thirdparty/CUDF_GetCUCO.cmake) # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile @@ -500,7 +498,6 @@ target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") target_include_directories(cudf PUBLIC "$" "$" - "$" "$" "$" "$" diff --git a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake index 73717249585..54dbd0e821f 100644 --- a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake @@ -22,8 +22,8 @@ function(find_and_configure_cucollections) # Find or install cuCollections CPMFindPackage(NAME cuco - GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e + GITHUB_REPOSITORY PointKernel/cuCollections + GIT_TAG static-multi-map OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" From 31cd0173e3bcf092cccfabc19522f67e2731623d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Aug 2021 16:56:30 -0400 Subject: [PATCH 35/55] Remove redundant CUDF_GetCUCO.cmake --- cpp/cmake/thirdparty/CUDF_GetCUCO.cmake | 37 ------------------------- 1 file changed, 37 deletions(-) delete mode 100644 cpp/cmake/thirdparty/CUDF_GetCUCO.cmake diff --git a/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake b/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake deleted file mode 100644 index ee06d53de32..00000000000 --- a/cpp/cmake/thirdparty/CUDF_GetCUCO.cmake +++ /dev/null @@ -1,37 +0,0 @@ -#============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -function(find_and_configure_cuco) - if(CUCO_INCLUDE) - set(CUCO_INCLUDE_DIR "${CUCO_INCLUDE}" PARENT_SCOPE) - return() - endif() - if(CUCO_INCLUDE_DIR) - set(CUCO_INCLUDE_DIR ${CUCO_INCLUDE_DIR} PARENT_SCOPE) - return() - endif() - CPMFindPackage(NAME cuco - GITHUB_REPOSITORY PointKernel/cuCollections - GIT_TAG static-multi-map - GIT_SHALLOW TRUE - DOWNLOAD_ONLY TRUE - OPTIONS "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - "BUILD_TESTS OFF") - set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE) -endfunction() - -find_and_configure_cuco() From 46f516649829c65398116dac80b86c71fc35be44 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 24 Aug 2021 19:16:05 -0400 Subject: [PATCH 36/55] Update test_joining pytests: sort before compare --- python/cudf/cudf/tests/test_joining.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index c37939df7d3..3c053ff37b5 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -196,8 +196,11 @@ def test_dataframe_join_suffix(): # Check assert list(expect.columns) == list(got.columns) assert_eq(expect.index.values, got.index.values) - for k in expect.columns: - _check_series(expect[k].fillna(-1), got[k].fillna(-1)) + + got_sorted = got.sort_values(by=list(got.columns), axis=0) + expect_sorted = expect.sort_values(by=list(expect.columns), axis=0) + for k in expect_sorted.columns: + _check_series(expect_sorted[k].fillna(-1), got_sorted[k].fillna(-1)) def test_dataframe_join_cats(): @@ -1374,7 +1377,11 @@ def test_categorical_typecast_inner(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"], check_categorical=False) + assert_eq( + expect_data, + result["key"].sort_values().reset_index(drop=True), + check_categorical=False, + ) # Equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1392,7 +1399,11 @@ def test_categorical_typecast_inner(): expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False) expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"], check_categorical=False) + assert_eq( + expect_data, + result["key"].sort_values().reset_index(drop=True), + check_categorical=False, + ) # One is ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1422,7 +1433,7 @@ def test_categorical_typecast_left(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) # equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1476,7 +1487,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) # equal categories, both ordered -> common dtype left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1486,7 +1497,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=True) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) # equal categories, one ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) From 57ac7312d72f40330af8d28f19d7f80ae6f9d3e7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Aug 2021 11:52:19 -0400 Subject: [PATCH 37/55] Use assert_join_results_equal instead of naive assert_eq --- python/cudf/cudf/tests/test_joining.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 3c053ff37b5..e32087e8bcc 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1377,10 +1377,8 @@ def test_categorical_typecast_inner(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq( - expect_data, - result["key"].sort_values().reset_index(drop=True), - check_categorical=False, + assert_join_results_equal( + expect_data, result["key"], how="inner", check_categorical=False ) # Equal categories, unequal ordering -> error @@ -1399,10 +1397,8 @@ def test_categorical_typecast_inner(): expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False) expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key") - assert_eq( - expect_data, - result["key"].sort_values().reset_index(drop=True), - check_categorical=False, + assert_join_results_equal( + expect_data, result["key"], how="inner", check_categorical=False ) # One is ordered -> error @@ -1433,7 +1429,7 @@ def test_categorical_typecast_left(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="left") # equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1452,7 +1448,7 @@ def test_categorical_typecast_left(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="left") # unequal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1487,7 +1483,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="outer") # equal categories, both ordered -> common dtype left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1497,7 +1493,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=True) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="outer") # equal categories, one ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1516,7 +1512,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3, 4], ordered=False) expect_data = cudf.Series([1, 2, 3, 4], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="outer") # unequal categories, one ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) From c5690e0d38bd7d6954ff49517cae51f3231acacd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Aug 2021 13:49:16 -0400 Subject: [PATCH 38/55] Update pytests: sort before compare --- python/cudf/cudf/tests/test_indexing.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 4 ++-- python/cudf/cudf/tests/test_replace.py | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 58d39ff35a6..1619bf72304 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -361,7 +361,7 @@ def test_dataframe_loc_duplicate_index_scalar(): pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(pdf.loc[2], gdf.loc[2]) + assert_eq(pdf.loc[2].sort_index(), gdf.loc[2].sort_index()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 18a82b58670..54234e9bf9e 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -296,7 +296,7 @@ def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple): assert_eq(pdfIndex, gdfIndex) pdf.index = pdfIndex gdf.index = gdfIndex - assert_eq(pdf.loc[key_tuple], gdf.loc[key_tuple]) + assert_eq(pdf.loc[key_tuple].sort_index(), gdf.loc[key_tuple].sort_index()) @pytest.mark.parametrize( @@ -964,7 +964,7 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) pdf.index = pdfIndex gdf.index = gdfIndex - assert_eq(pdf.loc[("a",), :], gdf.loc[("a",), :]) + assert_eq(pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index()) assert_eq(pdf.loc[(("a"), ("store")), :], gdf.loc[(("a"), ("store")), :]) assert_eq( pdf.loc[(("a"), ("store"), ("storm")), :], diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index f60baec746f..e14063d058b 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -58,7 +58,7 @@ def test_series_replace_all(gsr, to_replace, value): actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) expected = psr.replace(to_replace=pd_to_replace, value=pd_value) - assert_eq(expected, actual) + assert_eq(expected.sort_index(), actual.sort_index()) def test_series_replace(): @@ -75,7 +75,7 @@ def test_series_replace(): psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) sr4 = sr3.replace("one", "two") - assert_eq(psr4, sr4) + assert_eq(psr4.sort_index(), sr4.sort_index()) psr5 = psr3.replace("one", "five") sr5 = sr3.replace("one", "five") @@ -226,7 +226,7 @@ def test_dataframe_replace(df, to_replace, value): expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) - assert_eq(expected, actual) + assert_eq(expected.sort_index(), actual.sort_index()) def test_dataframe_replace_with_nulls(): @@ -1001,8 +1001,8 @@ def test_replace_inplace(): assert_eq(sr_copy, psr_copy) sr.replace("one", "two", inplace=True) psr.replace("one", "two", inplace=True) - assert_eq(sr, psr) - assert_eq(sr_copy, psr_copy) + assert_eq(sr.sort_index(), psr.sort_index()) + assert_eq(sr_copy.sort_index(), psr_copy.sort_index()) pdf = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]}) gdf = cudf.from_pandas(pdf) @@ -1342,4 +1342,4 @@ def test_series_replace_errors(): def test_replace_nulls(gsr, old, new, expected): actual = gsr.replace(old, new) - assert_eq(expected, actual) + assert_eq(expected.sort_index(), actual.sort_index()) From 8a00fd2d4f9d9f1d8338956fee06ccb12beb2172 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Aug 2021 16:16:33 -0400 Subject: [PATCH 39/55] Corrections: use sort_values() --- python/cudf/cudf/tests/test_indexing.py | 5 +++- python/cudf/cudf/tests/test_multiindex.py | 5 +++- python/cudf/cudf/tests/test_replace.py | 30 ++++++++++++++++++----- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 1619bf72304..edce22072c5 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -361,7 +361,10 @@ def test_dataframe_loc_duplicate_index_scalar(): pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(pdf.loc[2].sort_index(), gdf.loc[2].sort_index()) + assert_eq( + pdf.loc[2].sort_values().reset_index(drop=True), + gdf.loc[2].sort_values().reset_index(drop=True), + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 54234e9bf9e..7d12841d706 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -965,7 +965,10 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index()) - assert_eq(pdf.loc[(("a"), ("store")), :], gdf.loc[(("a"), ("store")), :]) + assert_eq( + pdf.loc[(("a"), ("store")), :].sort_index(), + gdf.loc[(("a"), ("store")), :].sort_index(), + ) assert_eq( pdf.loc[(("a"), ("store"), ("storm")), :], gdf.loc[(("a"), ("store"), ("storm")), :], diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e14063d058b..c79182c63be 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -58,7 +58,10 @@ def test_series_replace_all(gsr, to_replace, value): actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) expected = psr.replace(to_replace=pd_to_replace, value=pd_value) - assert_eq(expected.sort_index(), actual.sort_index()) + assert_eq( + expected.sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) def test_series_replace(): @@ -75,7 +78,10 @@ def test_series_replace(): psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) sr4 = sr3.replace("one", "two") - assert_eq(psr4.sort_index(), sr4.sort_index()) + assert_eq( + psr4.sort_values().reset_index(drop=True), + sr4.sort_values().reset_index(drop=True), + ) psr5 = psr3.replace("one", "five") sr5 = sr3.replace("one", "five") @@ -226,7 +232,10 @@ def test_dataframe_replace(df, to_replace, value): expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) - assert_eq(expected.sort_index(), actual.sort_index()) + assert_eq( + expected.sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) def test_dataframe_replace_with_nulls(): @@ -1001,8 +1010,14 @@ def test_replace_inplace(): assert_eq(sr_copy, psr_copy) sr.replace("one", "two", inplace=True) psr.replace("one", "two", inplace=True) - assert_eq(sr.sort_index(), psr.sort_index()) - assert_eq(sr_copy.sort_index(), psr_copy.sort_index()) + assert_eq( + sr.sort_values().reset_index(drop=True), + psr.sort_values().reset_index(drop=True), + ) + assert_eq( + sr_copy.sort_values().reset_index(drop=True), + psr_copy.sort_values().reset_index(drop=True), + ) pdf = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]}) gdf = cudf.from_pandas(pdf) @@ -1342,4 +1357,7 @@ def test_series_replace_errors(): def test_replace_nulls(gsr, old, new, expected): actual = gsr.replace(old, new) - assert_eq(expected.sort_index(), actual.sort_index()) + assert_eq( + expected.sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) From 1385d50a1d51e9f1d53fd40417dabec0faef017b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 25 Aug 2021 17:41:59 -0400 Subject: [PATCH 40/55] Minor corrections --- python/cudf/cudf/tests/test_indexing.py | 8 ++++---- python/cudf/cudf/tests/test_replace.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index edce22072c5..b1eb9c7b01f 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -361,10 +361,10 @@ def test_dataframe_loc_duplicate_index_scalar(): pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq( - pdf.loc[2].sort_values().reset_index(drop=True), - gdf.loc[2].sort_values().reset_index(drop=True), - ) + pdf_sorted = pdf.sort_values(by=list(pdf.columns), axis=0) + gdf_sorted = gdf.sort_values(by=list(gdf.columns), axis=0) + + assert_eq(pdf_sorted, gdf_sorted) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index c79182c63be..2e32b216630 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -232,10 +232,10 @@ def test_dataframe_replace(df, to_replace, value): expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) - assert_eq( - expected.sort_values().reset_index(drop=True), - actual.sort_values().reset_index(drop=True), - ) + expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) + actual_sorted = actual.sort_values(by=list(actual.columns), axis=0) + + assert_eq(expected_sorted, actual_sorted) def test_dataframe_replace_with_nulls(): From a3174286385b80882ad862f91a917737159a8965 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Aug 2021 17:14:40 -0400 Subject: [PATCH 41/55] Use insert instead of insert_if if row_bitmask is null --- cpp/src/join/hash_join.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index ccca58cba6f..fb82ab155d5 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -77,11 +77,6 @@ void build_join_hash_table(cudf::table_view const& build, CUDF_EXPECTS(0 != build_device_table->num_columns(), "Selected build dataset is empty"); CUDF_EXPECTS(0 != build_device_table->num_rows(), "Build side table has no rows"); - auto const row_bitmask = (compare_nulls == null_equality::EQUAL) - ? rmm::device_buffer{0, stream} - : cudf::detail::bitmask_and(build, stream); - row_contains_null pred{static_cast(row_bitmask.data())}; - row_hash hash_build{*build_device_table}; auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_build, empty_key_sentinel}; @@ -90,7 +85,14 @@ void build_join_hash_table(cudf::table_view const& build, auto iter = thrust::make_transform_iterator(first, pair_func); size_type const build_table_num_rows{build_device_table->num_rows()}; - hash_table.insert_if(iter, iter + build_table_num_rows, pred, stream.value()); + if (compare_nulls == null_equality::EQUAL) { + hash_table.insert(iter, iter + build_table_num_rows, stream.value()); + } else { + auto const row_bitmask = cudf::detail::bitmask_and(build, stream); + row_contains_null pred{static_cast(row_bitmask.data())}; + + hash_table.insert_if(iter, iter + build_table_num_rows, pred, stream.value()); + } } /** From 5cfc15ecf6ef99d1c4349455de9e89b3c48f98e9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 26 Aug 2021 17:33:29 -0400 Subject: [PATCH 42/55] Compare against nullptr to avoid implicit type conversion --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index fb82ab155d5..33c39eda6b2 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -41,7 +41,7 @@ class row_contains_null { __device__ __inline__ bool operator()(const pair_type& pair) const noexcept { - return !_row_bitmask || cudf::bit_is_set(_row_bitmask, pair.second); + return _row_bitmask == nullptr or cudf::bit_is_set(_row_bitmask, pair.second); } private: From a8ddcf7c7bf3069621f24416ed29549e0456da50 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 30 Aug 2021 18:06:39 -0400 Subject: [PATCH 43/55] Add default cg size = 2 --- cpp/src/join/join_common_utils.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index ffb48690f76..253de77fc35 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -29,17 +29,19 @@ namespace cudf { namespace detail { constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; +constexpr int DEFAULT_JOIN_CG_SIZE = 2; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = std::numeric_limits::min(); using pair_type = cuco::pair_type; -using multimap_type = cuco::static_multimap, - cuda::thread_scope_device, - default_allocator>; +using multimap_type = + cuco::static_multimap, + cuda::thread_scope_device, + default_allocator>; using row_hash = cudf::row_hasher; From 6165d3f1d8bf26b317e5c3cf0eaecb9d8b8703b7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 31 Aug 2021 18:51:01 -0400 Subject: [PATCH 44/55] Avoid unnecessary row_bitmask build --- cpp/src/join/hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index e0b649474e1..097549ddf5d 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -85,7 +85,7 @@ void build_join_hash_table(cudf::table_view const& build, auto iter = thrust::make_transform_iterator(first, pair_func); size_type const build_table_num_rows{build_device_table->num_rows()}; - if (compare_nulls == null_equality::EQUAL) { + if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { auto const row_bitmask = cudf::detail::bitmask_and(build, stream); From dd5b177d155b3998773c6ce8c8f2691e033c13e0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 1 Sep 2021 08:47:49 -0400 Subject: [PATCH 45/55] Sort before assert_eq to avoid out-of-order comparison in pytests --- python/cudf/cudf/tests/test_multiindex.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 7d12841d706..562f284b33a 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -970,23 +970,28 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdf.loc[(("a"), ("store")), :].sort_index(), ) assert_eq( - pdf.loc[(("a"), ("store"), ("storm")), :], - gdf.loc[(("a"), ("store"), ("storm")), :], + pdf.loc[(("a"), ("store"), ("storm")), :].sort_index(), + gdf.loc[(("a"), ("store"), ("storm")), :].sort_index(), ) assert_eq( - pdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :], - gdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :], + pdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :].sort_index(), + gdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), "store"), :], gdf.loc[(slice(None), "store"), :] + pdf.loc[(slice(None), "store"), :].sort_index(), + gdf.loc[(slice(None), "store"), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), slice(None), "storm"), :], - gdf.loc[(slice(None), slice(None), "storm"), :], + pdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), + gdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), slice(None), slice(None), "smoke"), :], - gdf.loc[(slice(None), slice(None), slice(None), "smoke"), :], + pdf.loc[ + (slice(None), slice(None), slice(None), "smoke"), : + ].sort_index(), + gdf.loc[ + (slice(None), slice(None), slice(None), "smoke"), : + ].sort_index(), ) From f6d3df4dc786522addd7d8ba14a61e293040664e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 1 Sep 2021 10:28:39 -0400 Subject: [PATCH 46/55] Update cuco cmake: use rapids_cpm_find instead of CPMFindPackage --- cpp/cmake/thirdparty/get_cucollections.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 4dd8882a025..4703b5bd468 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -14,6 +14,8 @@ # limitations under the License. #============================================================================= +# cuCollections doesn't have a version + function(find_and_configure_cucollections) if(TARGET cuco::cuco) @@ -21,7 +23,7 @@ function(find_and_configure_cucollections) endif() # Find or install cuCollections - CPMFindPackage(NAME cuco + rapids_cpm_find(cuco 0.0.1 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY PointKernel/cuCollections From d2252b6959325675cbca70ce2eb9044d15d707d2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 1 Sep 2021 15:08:19 -0400 Subject: [PATCH 47/55] Remove redundant if TARGET check in cuco cmake --- cpp/cmake/thirdparty/get_cucollections.cmake | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 4703b5bd468..05c0942d11b 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -17,11 +17,6 @@ # cuCollections doesn't have a version function(find_and_configure_cucollections) - - if(TARGET cuco::cuco) - return() - endif() - # Find or install cuCollections rapids_cpm_find(cuco 0.0.1 GLOBAL_TARGETS cuco::cuco From 8eeaf75e211888c008ca2ccc4e21c9bd4150987a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 2 Sep 2021 10:24:54 -0400 Subject: [PATCH 48/55] Minor correction on comment --- cpp/benchmarks/join/join_benchmark_common.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_benchmark_common.hpp index 3ca3804acc7..e88253395d8 100644 --- a/cpp/benchmarks/join/join_benchmark_common.hpp +++ b/cpp/benchmarks/join/join_benchmark_common.hpp @@ -66,7 +66,7 @@ static void BM_join(state_type& state, Join JoinFunc) // Generate build and probe tables cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); auto build_random_null_mask = [&rand_gen](int size) { - // roughly 25% nulls + // roughly 75% nulls auto validity = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); From 7175e35ae336fd6e140ea1b180dad1d0f15f58a1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 3 Sep 2021 07:17:14 -0400 Subject: [PATCH 49/55] Minor improvement: insert_if predicate on the stencil sequence --- cpp/src/join/hash_join.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 097549ddf5d..28ef5c134b6 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -39,9 +39,9 @@ class row_contains_null { public: row_contains_null(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} - __device__ __inline__ bool operator()(const pair_type& pair) const noexcept + __device__ __inline__ bool operator()(const size_type& i) const noexcept { - return _row_bitmask == nullptr or cudf::bit_is_set(_row_bitmask, pair.second); + return _row_bitmask == nullptr or cudf::bit_is_set(_row_bitmask, i); } private: @@ -91,7 +91,7 @@ void build_join_hash_table(cudf::table_view const& build, auto const row_bitmask = cudf::detail::bitmask_and(build, stream); row_contains_null pred{static_cast(row_bitmask.data())}; - hash_table.insert_if(iter, iter + build_table_num_rows, pred, stream.value()); + hash_table.insert_if_n(iter, first, build_table_num_rows, pred, stream.value()); } } From 0af148bf9c978ab3cd5649246059353edadf3b78 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 3 Sep 2021 11:03:42 -0400 Subject: [PATCH 50/55] Cleanups: remove unnecessary nullptr check + insert_if instead of insert_if_n --- cpp/src/join/hash_join.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 28ef5c134b6..07058d81ec1 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -41,7 +41,7 @@ class row_contains_null { __device__ __inline__ bool operator()(const size_type& i) const noexcept { - return _row_bitmask == nullptr or cudf::bit_is_set(_row_bitmask, i); + return cudf::bit_is_set(_row_bitmask, i); } private: @@ -91,7 +91,7 @@ void build_join_hash_table(cudf::table_view const& build, auto const row_bitmask = cudf::detail::bitmask_and(build, stream); row_contains_null pred{static_cast(row_bitmask.data())}; - hash_table.insert_if_n(iter, first, build_table_num_rows, pred, stream.value()); + hash_table.insert_if(iter, iter + build_table_num_rows, first, pred, stream.value()); } } From 7ee81c8da2d3d9d7763259123ebc7573a59fb818 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Sep 2021 19:34:42 -0400 Subject: [PATCH 51/55] Updates: proper variable names + rename _empty --- cpp/src/join/hash_join.cu | 30 +++++++++++++++--------------- cpp/src/join/hash_join.cuh | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 07058d81ec1..fc1f022827e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -72,19 +72,19 @@ void build_join_hash_table(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) { - auto build_device_table = cudf::table_device_view::create(build, stream); + auto build_table_ptr = cudf::table_device_view::create(build, stream); - CUDF_EXPECTS(0 != build_device_table->num_columns(), "Selected build dataset is empty"); - CUDF_EXPECTS(0 != build_device_table->num_rows(), "Build side table has no rows"); + CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); + CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); - row_hash hash_build{*build_device_table}; + row_hash hash_build{*build_table_ptr}; auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_build, empty_key_sentinel}; thrust::counting_iterator first(0); auto iter = thrust::make_transform_iterator(first, pair_func); - size_type const build_table_num_rows{build_device_table->num_rows()}; + size_type const build_table_num_rows{build_table_ptr->num_rows()}; if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { @@ -282,7 +282,7 @@ hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) - : _empty{!build.num_rows()}, + : _is_empty{build.num_rows() == 0}, _hash_table{compute_hash_table_size(build.num_rows()), std::numeric_limits::max(), cudf::detail::JoinNoneValue, @@ -299,7 +299,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, // need to store off the owning structures for some of the views in _build _created_null_columns = std::move(std::get<3>(flattened_build)); - if (_empty) { return; } + if (_is_empty) { return; } build_join_hash_table(_build, _hash_table, compare_nulls, stream); } @@ -350,7 +350,7 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p CUDF_FUNC_RANGE(); // Return directly if build table is empty - if (_empty) { return 0; } + if (_is_empty) { return 0; } auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); @@ -370,7 +370,7 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr CUDF_FUNC_RANGE(); // Trivial left join case - exit early - if (_empty) { return probe.num_rows(); } + if (_is_empty) { return probe.num_rows(); } auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); @@ -391,7 +391,7 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr CUDF_FUNC_RANGE(); // Trivial left join case - exit early - if (_empty) { return probe.num_rows(); } + if (_is_empty) { return probe.num_rows(); } auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); @@ -450,17 +450,17 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early - if (_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { + if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { return get_trivial_left_join_indices(probe, stream, mr); } - CUDF_EXPECTS(!_empty, "Hash table of hash join is null."); + CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); - auto build_table = cudf::table_device_view::create(_build, stream); - auto probe_table = cudf::table_device_view::create(probe, stream); + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto probe_table_ptr = cudf::table_device_view::create(probe, stream); auto join_indices = cudf::detail::probe_join_hash_table( - *build_table, *probe_table, _hash_table, compare_nulls, output_size, stream, mr); + *build_table_ptr, *probe_table_ptr, _hash_table, compare_nulls, output_size, stream, mr); if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index e049db08b9c..5a0364a92bb 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -153,9 +153,9 @@ struct hash_join::hash_join_impl { hash_join_impl& operator=(hash_join_impl&&) = delete; private: + bool _is_empty; cudf::table_view _build; std::vector> _created_null_columns; - bool _empty; cudf::detail::multimap_type _hash_table; public: From 8cb07625a9f9485df982a21fcf12b76a1dbfd64a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 16 Sep 2021 22:21:10 -0400 Subject: [PATCH 52/55] Minor update: double hashing moved to cuco::detail namespace --- cpp/src/join/join_common_utils.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 253de77fc35..85cf3b9913c 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -36,12 +36,12 @@ constexpr size_type JoinNoneValue = std::numeric_limits::min(); using pair_type = cuco::pair_type; -using multimap_type = - cuco::static_multimap, - cuda::thread_scope_device, - default_allocator>; +using multimap_type = cuco::static_multimap< + hash_value_type, + size_type, + cuco::detail::double_hashing, + cuda::thread_scope_device, + default_allocator>; using row_hash = cudf::row_hasher; From bb66362d4038b436b077f6729e9e899b5b13aa64 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 20 Sep 2021 12:17:17 -0400 Subject: [PATCH 53/55] Address review comments: add docs + use make_counting_transform_iterator --- cpp/src/join/hash_join.cu | 12 +++++------- cpp/src/join/hash_join.cuh | 9 ++++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index fc1f022827e..f4e5bf7c476 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -81,8 +81,8 @@ void build_join_hash_table(cudf::table_view const& build, auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_build, empty_key_sentinel}; - thrust::counting_iterator first(0); - auto iter = thrust::make_transform_iterator(first, pair_func); + thrust::counting_iterator stencil(0); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); size_type const build_table_num_rows{build_table_ptr->num_rows()}; if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { @@ -91,7 +91,7 @@ void build_join_hash_table(cudf::table_view const& build, auto const row_bitmask = cudf::detail::bitmask_and(build, stream); row_contains_null pred{static_cast(row_bitmask.data())}; - hash_table.insert_if(iter, iter + build_table_num_rows, first, pred, stream.value()); + hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); } } @@ -145,8 +145,7 @@ probe_join_hash_table(cudf::table_device_view build_table, auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_probe, empty_key_sentinel}; - thrust::counting_iterator first(0); - auto iter = thrust::make_transform_iterator(first, pair_func); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); const cudf::size_type probe_table_num_rows = probe_table.num_rows(); @@ -210,8 +209,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_probe, empty_key_sentinel}; - thrust::counting_iterator first(0); - auto iter = thrust::make_transform_iterator(first, pair_func); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); const cudf::size_type probe_table_num_rows = probe_table.num_rows(); diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 5a0364a92bb..e66ad0957a8 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -52,6 +52,9 @@ constexpr auto remap_sentinel_hash(H hash, S sentinel) return (hash == sentinel) ? (hash - 1) : hash; } +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ class make_pair_function { public: make_pair_function(row_hash const& hash, hash_value_type const empty_key_sentinel) @@ -59,8 +62,9 @@ class make_pair_function { { } - __device__ __inline__ cudf::detail::pair_type operator()(size_type i) const noexcept + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept { + // Compute the hash value of row `i` auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); return cuco::make_pair(std::move(row_hash_value), std::move(i)); } @@ -120,8 +124,7 @@ std::size_t compute_join_output_size(table_device_view build_table, auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_probe, empty_key_sentinel}; - thrust::counting_iterator first(0); - auto iter = thrust::make_transform_iterator(first, pair_func); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); size_type size; if constexpr (JoinKind == join_kind::LEFT_JOIN) { From df0da487beb340cf79d69d64ae180f18d0854860 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 20 Sep 2021 12:41:38 -0400 Subject: [PATCH 54/55] Minor updates + rename row_contains_null as row_is_valid --- cpp/src/join/hash_join.cu | 12 ++++++++---- cpp/src/join/join_common_utils.hpp | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index f4e5bf7c476..90973525111 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -35,9 +35,12 @@ namespace detail { namespace { -class row_contains_null { +/** + * @brief Device functor to determine if a row is valid. + */ +class row_is_valid { public: - row_contains_null(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + row_is_valid(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} __device__ __inline__ bool operator()(const size_type& i) const noexcept { @@ -81,16 +84,17 @@ void build_join_hash_table(cudf::table_view const& build, auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); make_pair_function pair_func{hash_build, empty_key_sentinel}; - thrust::counting_iterator stencil(0); auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); size_type const build_table_num_rows{build_table_ptr->num_rows()}; if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { + thrust::counting_iterator stencil(0); auto const row_bitmask = cudf::detail::bitmask_and(build, stream); - row_contains_null pred{static_cast(row_bitmask.data())}; + row_is_valid pred{static_cast(row_bitmask.data())}; + // insert valid rows hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); } } diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 85cf3b9913c..3e314586b3c 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -39,8 +39,8 @@ using pair_type = cuco::pair_type; using multimap_type = cuco::static_multimap< hash_value_type, size_type, - cuco::detail::double_hashing, cuda::thread_scope_device, + cuco::detail::double_hashing, default_allocator>; using row_hash = cudf::row_hasher; From c49166e3a2523210534e1b3612e7d3c92688d308 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 1 Nov 2021 15:40:25 -0400 Subject: [PATCH 55/55] Updates: fetching the latest cuco tag --- cpp/src/join/hash_join.cu | 17 +++++++++-------- cpp/src/join/hash_join.cuh | 3 ++- cpp/src/join/join_common_utils.hpp | 14 ++++++++------ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 2f9eab30012..99a94c45510 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -154,23 +154,24 @@ probe_join_hash_table(cudf::table_device_view build_table, const cudf::size_type probe_table_num_rows = probe_table.num_rows(); - auto out1_zip = thrust::make_zip_iterator( + auto out1_zip_begin = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin())); - auto out2_zip = thrust::make_zip_iterator( + auto out2_zip_begin = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin())); if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN or JoinKind == cudf::detail::join_kind::LEFT_JOIN) { - [[maybe_unused]] auto const actual_size = hash_table.pair_retrieve_outer( - iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); + [[maybe_unused]] auto [out1_zip_end, out2_zip_end] = hash_table.pair_retrieve_outer( + iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + auto const actual_size = out1_zip_end - out1_zip_begin; left_indices->resize(actual_size, stream); right_indices->resize(actual_size, stream); } } else { hash_table.pair_retrieve( - iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); + iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); } return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -218,13 +219,13 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, const cudf::size_type probe_table_num_rows = probe_table.num_rows(); - auto out1_zip = thrust::make_zip_iterator( + auto out1_zip_begin = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin())); - auto out2_zip = thrust::make_zip_iterator( + auto out2_zip_begin = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin())); hash_table.pair_retrieve_outer( - iter, iter + probe_table_num_rows, out1_zip, out2_zip, equality, stream.value()); + iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); // Release intermediate memory allocation left_indices->resize(0, stream); diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index ef12a0f922c..aa3d6a20d7f 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -15,10 +15,11 @@ */ #pragma once -#include #include #include +#include +#include #include #include #include diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 3e314586b3c..d6eb5e93a98 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -36,12 +36,14 @@ constexpr size_type JoinNoneValue = std::numeric_limits::min(); using pair_type = cuco::pair_type; -using multimap_type = cuco::static_multimap< - hash_value_type, - size_type, - cuda::thread_scope_device, - cuco::detail::double_hashing, - default_allocator>; +using hash_type = cuco::detail::MurmurHash3_32; + +using multimap_type = + cuco::static_multimap, + cuco::double_hashing>; using row_hash = cudf::row_hasher;