From 3c2639500d142f08e391f2583800d5e22c514420 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 18 Jun 2021 12:22:01 -0400 Subject: [PATCH] remove legacy renumbering code --- cpp/CMakeLists.txt | 1 - cpp/include/cugraph/functions.hpp | 34 - cpp/src/converters/renumber.cu | 68 -- cpp/src/converters/renumber.cuh | 321 ---------- cpp/src/sort/bitonic.cuh | 546 ----------------- cpp/src/utilities/heap.cuh | 222 ------- cpp/tests/CMakeLists.txt | 4 - cpp/tests/renumber/renumber_test.cu | 579 ------------------ python/cugraph/structure/graph_primtypes.pxd | 10 - .../structure/graph_primtypes_wrapper.pyx | 40 -- 10 files changed, 1825 deletions(-) delete mode 100644 cpp/src/converters/renumber.cu delete mode 100644 cpp/src/converters/renumber.cuh delete mode 100644 cpp/src/sort/bitonic.cuh delete mode 100644 cpp/src/utilities/heap.cuh delete mode 100644 cpp/tests/renumber/renumber_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ae0b524689f..7da32f451b8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -169,7 +169,6 @@ add_library(cugraph SHARED src/link_prediction/jaccard.cu src/link_prediction/overlap.cu src/layout/force_atlas2.cu - src/converters/renumber.cu src/converters/COOtoCSR.cu src/community/spectral_clustering.cu src/community/louvain.cu diff --git a/cpp/include/cugraph/functions.hpp b/cpp/include/cugraph/functions.hpp index 00e8648b156..930eeb52b5c 100644 --- a/cpp/include/cugraph/functions.hpp +++ b/cpp/include/cugraph/functions.hpp @@ -44,40 +44,6 @@ std::unique_ptr> coo_to_csr( GraphCOOView const &graph, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -/** - * @brief Renumber source and destination indices - * - * Renumber source and destination indexes to be a dense numbering, - * using contiguous values between 0 and number of vertices minus 1. - * - * @throws cugraph::logic_error when an error occurs. - * - * @tparam VT_IN type of vertex index input - * @tparam VT_OUT type of vertex index output - * @tparam ET type of edge index - * - * @param[in] number_of_edges number of edges in the graph - * @param[in] src Pointer to device memory containing source vertex ids - * @param[in] dst Pointer to device memory containing destination vertex ids - * @param[out] src_renumbered Pointer to device memory containing the output source vertices. - * @param[out] dst_renumbered Pointer to device memory containing the output destination vertices. - * @param[out] map_size Pointer to local memory containing the number of elements in the - * renumbering map - * @param[in] mr Memory resource used to allocate the returned graph - * - * @return Unique pointer to renumbering map - * - */ -template -std::unique_ptr renumber_vertices( - ET number_of_edges, - VT_IN const *src, - VT_IN const *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); - /** * @brief Broadcast using handle communicator * diff --git a/cpp/src/converters/renumber.cu b/cpp/src/converters/renumber.cu deleted file mode 100644 index 9aedbc70e8b..00000000000 --- a/cpp/src/converters/renumber.cu +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "renumber.cuh" - -namespace cugraph { - -template -std::unique_ptr renumber_vertices( - ET number_of_edges, - VT_IN const *src, - VT_IN const *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) - -{ - // - // For now, let's just specify a default value of the hash size. - // This should be configurable. - // - // FIXME: cudf has a hash table implementation (moving to cuCollections) - // that is dynamic. We should use it instead, it will be faster - // and dynamically adjust to data sizes. - // - int hash_size = 8191; - - return cugraph::detail::renumber_vertices(number_of_edges, - src, - dst, - src_renumbered, - dst_renumbered, - map_size, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - mr); -} - -template std::unique_ptr renumber_vertices(int32_t, - int64_t const *, - int64_t const *, - int32_t *, - int32_t *, - int32_t *, - rmm::mr::device_memory_resource *); -template std::unique_ptr renumber_vertices(int32_t, - int32_t const *, - int32_t const *, - int32_t *, - int32_t *, - int32_t *, - rmm::mr::device_memory_resource *); - -} // namespace cugraph diff --git a/cpp/src/converters/renumber.cuh b/cpp/src/converters/renumber.cuh deleted file mode 100644 index ccf4e6f62c2..00000000000 --- a/cpp/src/converters/renumber.cuh +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#define CUB_STDERR - -#include - -#include - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include "sort/bitonic.cuh" - -namespace cugraph { -namespace detail { - -namespace renumber { -typedef uint32_t hash_type; -typedef uint32_t index_type; -} // namespace renumber - -class HashFunctionObjectInt { - public: - HashFunctionObjectInt(renumber::hash_type hash_size) : hash_size_(hash_size) {} - - template - __device__ __inline__ renumber::hash_type operator()(const VertexIdType &vertex_id) const - { - return ((vertex_id % hash_size_) + hash_size_) % hash_size_; - } - - renumber::hash_type getHashSize() const { return hash_size_; } - - private: - renumber::hash_type hash_size_; -}; - -/** - * @brief Renumber vertices to a dense numbering (0..vertex_size-1) - * - * This is a templated function so it can take 32 or 64 bit integers. The - * intention is to take source and destination vertex ids that might be - * sparsely scattered across the range and push things down to a dense - * numbering. - * - * Arrays src, dst, src_renumbered, dst_renumbered and numbering_map are - * assumed to be pre-allocated. numbering_map is best safely allocated - * to store 2 * size vertices. - * - * @param[in] size Number of edges - * @param[in] src List of source vertices - * @param[in] dst List of dest vertices - * @param[out] src_renumbered List of source vertices, renumbered - * @param[out] dst_renumbered List of dest vertices, renumbered - * @param[out] vertex_size Number of unique vertices - * @param[out] numbering_map Map of new vertex id to original vertex id. numbering_map[newId] - * = oldId - * - */ -template -std::unique_ptr renumber_vertices(T_size size, - const T_in *src, - const T_in *dst, - T_out *src_renumbered, - T_out *dst_renumbered, - T_size *map_size, - Hash_t hash, - Compare_t compare, - rmm::mr::device_memory_resource *mr) -{ - // - // This function will allocate numbering_map to be the exact size needed - // (user doesn't know a priori how many unique vertices there are. - // - // Here's the idea: Create a hash table. Since we're dealing with integers, - // we can take the integer modulo some prime p to create hash buckets. Then - // we dedupe the hash buckets to create a deduped set of entries. This hash - // table can then be used to renumber everything. - // - // We need 2 arrays for hash indexes, and one array for data - // - cudaStream_t stream = nullptr; - - renumber::hash_type hash_size = hash.getHashSize(); - - rmm::device_vector hash_data_v(2 * size); - rmm::device_vector hash_bins_start_v(1 + hash_size, - renumber::index_type{0}); - rmm::device_vector hash_bins_end_v(1 + hash_size); - - T_in *hash_data = hash_data_v.data().get(); - renumber::index_type *hash_bins_start = hash_bins_start_v.data().get(); - renumber::index_type *hash_bins_end = hash_bins_end_v.data().get(); - - // - // Pass 1: count how many vertex ids end up in each hash bin - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - src, - src + size, - [hash_bins_start, hash] __device__(T_in vid) { - atomicAdd(hash_bins_start + hash(vid), renumber::index_type{1}); - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - dst, - dst + size, - [hash_bins_start, hash] __device__(T_in vid) { - atomicAdd(hash_bins_start + hash(vid), renumber::index_type{1}); - }); - - // - // Compute exclusive sum and copy it into both hash_bins_start and - // hash_bins_end. hash_bins_end will be used to populate the - // hash_data array and at the end will identify the end of - // each range. - // - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), - hash_bins_start, - hash_bins_start + hash_size + 1, - hash_bins_end); - - CUDA_TRY(cudaMemcpy(hash_bins_start, - hash_bins_end, - (hash_size + 1) * sizeof(renumber::hash_type), - cudaMemcpyDeviceToDevice)); - - // - // Pass 2: Populate hash_data with data from the hash bins. - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - src, - src + size, - [hash_bins_end, hash_data, hash] __device__(T_in vid) { - uint32_t hash_index = hash(vid); - renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1); - hash_data[hash_offset] = vid; - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - dst, - dst + size, - [hash_bins_end, hash_data, hash] __device__(T_in vid) { - uint32_t hash_index = hash(vid); - renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1); - hash_data[hash_offset] = vid; - }); - - // - // Now that we have data in hash bins, we'll do a segmented sort of the has bins - // to sort each bin. This will allow us to identify duplicates (all duplicates - // are in the same hash bin so they will end up sorted consecutively). - // - renumber::index_type size_as_int = size; - cugraph::sort::bitonic::segmented_sort( - hash_size, size_as_int, hash_bins_start, hash_bins_end, hash_data, compare, stream); - - // - // Now we rinse and repeat. hash_data contains the data organized into sorted - // hash bins. This allows us to identify duplicates. We'll start over but - // we'll skip the duplicates when we repopulate the hash table. - // - - // - // Pass 3: count how many vertex ids end up in each hash bin after deduping - // - CUDA_TRY(cudaMemset(hash_bins_start, 0, (1 + hash_size) * sizeof(renumber::index_type))); - - thrust::for_each( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(2 * size), - [hash_data, hash_bins_start, hash, compare, size] __device__(renumber::index_type idx) { - // - // Two items (a and b) are equal if - // compare(a,b) is false and compare(b,a) - // is also false. If either is true then - // a and b are not equal. - // - // Note that if there are k duplicate - // instances of an entry, only the LAST - // entry will be counted - // - bool unique = ((idx + 1) == (2 * size)) || compare(hash_data[idx], hash_data[idx + 1]) || - compare(hash_data[idx + 1], hash_data[idx]); - - if (unique) atomicAdd(hash_bins_start + hash(hash_data[idx]), renumber::index_type{1}); - }); - - // - // Compute exclusive sum and copy it into both hash_bins_start and - // hash bins end. - // - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), - hash_bins_start, - hash_bins_start + hash_size + 1, - hash_bins_end); - - CUDA_TRY(cudaMemcpy(hash_bins_start, - hash_bins_end, - (hash_size + 1) * sizeof(renumber::hash_type), - cudaMemcpyDeviceToDevice)); - - // - // The last entry in the array (hash_bins_end[hash_size]) is the - // total number of unique vertices - // - renumber::index_type temp = 0; - CUDA_TRY(cudaMemcpy( - &temp, hash_bins_end + hash_size, sizeof(renumber::index_type), cudaMemcpyDeviceToHost)); - *map_size = temp; - - rmm::device_buffer numbering_map(temp * sizeof(T_in), stream, mr); - T_in *local_numbering_map = static_cast(numbering_map.data()); - - // - // Pass 4: Populate hash_data with data from the hash bins after deduping - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(2 * size), - [hash_bins_end, hash_data, local_numbering_map, hash, compare, size] __device__( - renumber::index_type idx) { - bool unique = ((idx + 1) == (2 * size)) || - compare(hash_data[idx], hash_data[idx + 1]) || - compare(hash_data[idx + 1], hash_data[idx]); - - if (unique) { - uint32_t hash_index = hash(hash_data[idx]); - renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1); - local_numbering_map[hash_offset] = hash_data[idx]; - } - }); - - // - // At this point, hash_bins_start and numbering_map partition the - // unique data into a hash table. - // - - // - // If we do a segmented sort now, we can do the final lookups. - // - size_as_int = size; - cugraph::sort::bitonic::segmented_sort( - hash_size, size_as_int, hash_bins_start, hash_bins_end, local_numbering_map, compare, stream); - - // - // Renumber the input. For each vertex, identify the - // hash bin, and then search the hash bin for the - // record that matches, the relative offset between that - // element and the beginning of the array is the vertex - // id in the renumbered map. - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(size), - [local_numbering_map, - hash_bins_start, - hash_bins_end, - hash, - src, - src_renumbered, - compare] __device__(renumber::index_type idx) { - renumber::hash_type tmp = hash(src[idx]); - const T_in *id = - thrust::lower_bound(thrust::seq, - local_numbering_map + hash_bins_start[tmp], - local_numbering_map + hash_bins_end[tmp], - src[idx], - compare); - src_renumbered[idx] = id - local_numbering_map; - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(size), - [local_numbering_map, - hash_bins_start, - hash_bins_end, - hash, - dst, - dst_renumbered, - compare] __device__(renumber::index_type idx) { - renumber::hash_type tmp = hash(dst[idx]); - const T_in *id = - thrust::lower_bound(thrust::seq, - local_numbering_map + hash_bins_start[tmp], - local_numbering_map + hash_bins_end[tmp], - dst[idx], - compare); - dst_renumbered[idx] = id - local_numbering_map; - }); - - return std::make_unique(std::move(numbering_map)); -} - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/sort/bitonic.cuh b/cpp/src/sort/bitonic.cuh deleted file mode 100644 index b1b19bafdf0..00000000000 --- a/cpp/src/sort/bitonic.cuh +++ /dev/null @@ -1,546 +0,0 @@ -// -*-c++-*- - -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Bitonic sort implementation -// Author: Chuck Hastings charlesh@nvidia.com - -// TODO: Read a paper (Hagen Peters 2011) that suggests some -// ways to optimize this. Need to shift into a kernel -// and then organize to support multiple passes in -// a single kernel call. This should reduce kernel -// launch overhead and the number of memory references, -// which should drive down the overall time. -// - -#ifndef BITONIC_SORT_H -#define BITONIC_SORT_H - -#include -#include - -#include -#include - -namespace cugraph { -namespace sort { - -namespace bitonic { -/* - * This implementation is based upon the bitonic sort technique. - * This should be pretty efficient in a SIMT environment. - */ -namespace detail { -/** - * @brief Compare two items, if the compare functor returns true - * then swap them. - * - * @param a - reference to the first item - * @param b - reference to the second item - * @param compare - reference to a comparison functor - */ -template -inline void __device__ compareAndSwap(ValueT &a, ValueT &b, CompareT &compare) -{ - if (!compare(a, b)) { thrust::swap(a, b); } -} - -/* - * @brief perform repartitioning of two sorted partitions. This - * is analagous to the bitonic merge step. But it only - * performs the compare and swap portion of the bitonic - * merge. The subsequent sorts are handled externally. - * - * The repartition assumes that the data is segregated - * into partitions of binSize. So if there are 8 elements - * and a bin size of 2 then the array will be partitioned - * into 4 bins of size 2. Each bin is assumed to be - * sorted. The repartition takes consecutive bins and - * repartitions them so that the first bin contains the - * low elements and the second bin contains the high elements. - * - * @param array - the array containing the data we need to repartition - * @param count - the number of elements in the array - * @param binSize - the size of the bin - * @param compare - comparison functor - */ -template -void repartition(ValueT *array, int count, int binSize, CompareT &compare) -{ - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count / 2), - - [array, count, binSize, compare] __device__(int idx) { - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = idx % binSize; - - // - // First element is easy. - // Second element is "easy" but we'll fix - // special cases below. - // - int i = bi_partition * (binSize * 2) + offset; - int j = (bi_partition + 1) * (binSize * 2) - 1 - offset; - - // - // The last partition pair is the problem. - // There are several cases: - // 1) Both partitions are full. This - // is the easy case, we can just - // compare and swap elements - // 2) First partition is full, the second - // partition is not full (possibly - // empty). In this case, we only - // compare some of the elements. - // 3) First partition is not full, there - // is no second partition. In this - // case we actually don't have any - // work to do. - // - // This should be a simple check. If the - // second element is beyond the end of - // the array then there is nothing to compare - // and swap. Note that if the first - // element is beyond the end of the array - // there is also nothing to compare and swap, - // but if the first element is beyond the - // end of the array then the second element - // will also be beyond the end of the array. - // - if (j < count) compareAndSwap(array[i], array[j], compare); - }); -} - -/* - * @brief perform shuffles. After the repartition we need - * to perform shuffles of the halves to get things in - * order. - * - * @param array - the array containing the data we need to repartition - * @param count - the number of elements in the array - * @param binSize - the size of the bin - * @param compare - comparison functor - */ -template -void shuffles(ValueT *array, int count, int binSize, CompareT &compare) -{ - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator((count + 1) / 2), - [array, count, binSize, compare] __device__(int idx) { - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = idx % binSize; - - // - // First element is easy. - // Second element is "easy" i + binSize. - // - int i = bi_partition * (binSize * 2) + offset; - int j = i + binSize; - - // - // If the second element is beyond the end of - // the array then there is nothing to compare - // and swap. - // - if (j < count) compareAndSwap(array[i], array[j], compare); - }); -} - -/* - * @brief perform repartitioning of two sorted partitions in the - * segmented sort case. - * - * The repartition assumes that the data is segregated - * into partitions of binSize. So if there are 8 elements - * and a bin size of 2 then the array will be partitioned - * into 4 bins of size 2. Each bin is assumed to be - * sorted. The repartition takes consecutive bins and - * repartitions them so that the first bin contains the - * low elements and the second bin contains the high elements. - * - * @param array - the array containing the data we need to repartition - * @param count - the number of elements in the array - * @param binSize - the size of the bin - * @param compare - comparison functor - */ -template -void repartition_segmented(const IndexT *d_begin_offsets, - const IndexT *d_end_offsets, - ValueT *d_items, - IndexT start, - IndexT stop, - IndexT *d_grouped_bins, - int binSize, - int max_count, - int bin_pairs, - CompareT &compare) -{ - thrust::for_each(thrust::device, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(max_count / 2), - [d_begin_offsets, - d_end_offsets, - d_items, - start, - stop, - d_grouped_bins, - bin_pairs, - binSize, - compare] __device__(int idx) { - // - // idx needs to be mapped into the correct place - // - int entry = idx / bin_pairs; - int entry_idx = idx % bin_pairs; - int base = d_begin_offsets[d_grouped_bins[start + entry]]; - int count = d_end_offsets[d_grouped_bins[start + entry]] - base; - - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = entry_idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = entry_idx % binSize; - - // - // First element is easy. - // Second element is "easy" but we'll fix - // special cases below. - // - int i = bi_partition * (binSize * 2) + offset; - int j = (bi_partition + 1) * (binSize * 2) - 1 - offset; - - // - // The last partition pair is the problem. - // There are several cases: - // 1) Both partitions are full. This - // is the easy case, we can just - // compare and swap elements - // 2) First partition is full, the second - // partition is not full (possibly - // empty). In this case, we only - // compare some of the elements. - // 3) First partition is not full, there - // is no second partition. In this - // case we actually don't have any - // work to do. - // - // This should be a simple check. If the - // second element is beyond the end of - // the array then there is nothing to compare - // and swap. Note that if the first - // element is beyond the end of the array - // there is also nothing to compare and swap, - // but if the first element is beyond the - // end of the array then the second element - // will also be beyond the end of the array. - // - if (j < count) { - compareAndSwap(d_items[base + i], d_items[base + j], compare); - } - }); -} - -/* - * @brief perform shuffles. After the repartition we need - * to perform shuffles of the halves to get things in - * order. - * - * @param rowOffsets - the row offsets identifying the segments - * @param colIndices - the values to sort within the segments - * @param start - position within the grouped bins where we - * start this pass - * @param stop - position within the grouped bins where we stop - * this pass - * @param d_grouped_bins - lrb grouped bins. All bins between - * start and stop are in the same lrb bin - * @param binSize - the bitonic bin size for this pass of the shuffles - * @param max_count - maximum number of elements possible for - * this call - * @param bin_pairs - the number of bin pairs - * @param compare - the comparison functor - */ -template -void shuffles_segmented(const IndexT *d_begin_offsets, - const IndexT *d_end_offsets, - ValueT *d_items, - IndexT start, - IndexT stop, - IndexT *d_grouped_bins, - int binSize, - long max_count, - int bin_pairs, - CompareT &compare) -{ - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(max_count / 2), - [d_begin_offsets, - d_end_offsets, - d_items, - start, - stop, - d_grouped_bins, - compare, - max_count, - bin_pairs, - binSize] __device__(int idx) { - // - // idx needs to be mapped into the correct place - // - int entry = idx / bin_pairs; - int entry_idx = idx % bin_pairs; - int base = d_begin_offsets[d_grouped_bins[start + entry]]; - int count = d_end_offsets[d_grouped_bins[start + entry]] - base; - - // - // Identify which elements in which partition - // we are responsible for comparing and swapping - // - // We're running count/2 iterations. Each iteration - // needs to operate on a pair of elements. Consider - // the pairs of partitions, this will let us determine - // which elements we compare. - // - int bi_partition = entry_idx / binSize; - - // - // bi_partition identifies which pair of partitions - // we're operating on. Out of each bin we're only - // going to do binSize comparisons, so the first - // element in the comparison will be based on - // idx % binSize. - // - int offset = entry_idx % binSize; - - // - // First element is easy. - // Second element is "easy" i + binSize. - // - int i = bi_partition * (binSize * 2) + offset; - int j = i + binSize; - - // - // If the second element is beyond the end of - // the array then there is nothing to compare - // and swap. - // - if (j < count) compareAndSwap(d_items[base + i], d_items[base + j], compare); - }); -} -} // namespace detail - -template -void sort(ValueT *array, int count, CompareT &compare) -{ - for (int i = 1; i < count; i *= 2) { - detail::repartition(array, count, i, compare); - - for (int j = i / 2; j > 0; j /= 2) { detail::shuffles(array, count, j, compare); } - } -} - -/** - * @brief Perform a segmented sort. This function performs a sort - * on each segment of the specified input. This sort is done - * in place, so the d_items array is modified during this call. - * Sort is done according to the (optionally) specified - * comparison function. - * - * Note that this function uses O(num_segments) temporary - * memory during execution. - * - * @param [in] num_segments - the number of segments that the items array is divided into - * @param [in] num_items - the number of items in the array - * @param [in] d_begin_offsets - device array containing the offset denoting the start - * of each segment - * @param [in] d_end_offsets - device array containing the offset denoting the end - * of each segment. - * @param [in/out] d_items - device array containing the items to sort - * @param [in] compare - [optional] comparison function. Default is thrust::less. - * @param [in] stream - [optional] CUDA stream to launch kernels with. Default is stream 0. - * - * @return error code - */ -template -void segmented_sort(IndexT num_segments, - IndexT num_items, - const IndexT *d_begin_offsets, - const IndexT *d_end_offsets, - ValueT *d_items, - CompareT compare = thrust::less(), - cudaStream_t stream = nullptr) -{ - // - // NOTE: This should probably be computed somehow. At the moment - // we are limited to 32 bits because of memory sizes. - // - int lrb_size = 32; - IndexT lrb[lrb_size + 1]; - - rmm::device_vector lrb_v(lrb_size + 1); - rmm::device_vector grouped_bins_v(num_segments + 1); - - IndexT *d_lrb = lrb_v.data().get(); - IndexT *d_grouped_bins = grouped_bins_v.data().get(); - - CUDA_TRY(cudaMemset(d_lrb, 0, (lrb_size + 1) * sizeof(IndexT))); - - // - // First we'll count how many entries go in each bin - // - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_segments), - [d_begin_offsets, d_end_offsets, d_lrb] __device__(int idx) { - int size = d_end_offsets[idx] - d_begin_offsets[idx]; - // - // NOTE: If size is 0 or 1 then no - // sorting is required, so we'll - // eliminate those bins here - // - if (size > 1) atomicAdd(d_lrb + __clz(size), 1); - }); - - // - // Exclusive sum will identify where each bin begins - // - thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), d_lrb, d_lrb + (lrb_size + 1), d_lrb); - - // - // Copy the start of each bin to local memory - // - CUDA_TRY(cudaMemcpy(lrb, d_lrb, (lrb_size + 1) * sizeof(IndexT), cudaMemcpyDeviceToHost)); - - // - // Now we'll populate grouped_bins. This will corrupt - // d_lrb, but we've already copied it locally. - // - thrust::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_segments), - [d_begin_offsets, d_end_offsets, d_lrb, d_grouped_bins] __device__(int idx) { - int size = d_end_offsets[idx] - d_begin_offsets[idx]; - if (size > 1) { - int pos = atomicAdd(d_lrb + __clz(size), 1); - d_grouped_bins[pos] = idx; - } - }); - - // - // At this point, d_grouped_bins contains the index of the - // different segments, ordered into log2 bins. - // - - // - // Now we're ready to go. - // - // For simplicity (at least for now), let's just - // iterate over each lrb bin. Note that the larger - // the index i, the smaller the size of each bin... but - // there will likely be many more inhabitants of that bin. - // - for (int i = 0; i < lrb_size; ++i) { - int size = lrb[i + 1] - lrb[i]; - if (size > 0) { - // - // There are inhabitants of this lrb range - // - // max_count will be used to drive the bitonic - // passes (1, 2, 4, 8, ... up to max_count) - // - int max_count = 1 << (lrb_size - i); - - for (int j = 1; j < max_count; j *= 2) { - detail::repartition_segmented(d_begin_offsets, - d_end_offsets, - d_items, - lrb[i], - lrb[i + 1], - d_grouped_bins, - j, - size * max_count, - max_count / 2, - compare); - - for (int k = j / 2; k > 0; k /= 2) { - detail::shuffles_segmented(d_begin_offsets, - d_end_offsets, - d_items, - lrb[i], - lrb[i + 1], - d_grouped_bins, - k, - size * max_count, - max_count / 2, - compare); - } - } - } - } -} - -} // namespace bitonic -} // namespace sort -} // namespace cugraph - -#endif diff --git a/cpp/src/utilities/heap.cuh b/cpp/src/utilities/heap.cuh deleted file mode 100644 index 0747a658324..00000000000 --- a/cpp/src/utilities/heap.cuh +++ /dev/null @@ -1,222 +0,0 @@ -// -*-c++-*- - -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Utilities to treat array as a heap -// Author: Chuck Hastings charlesh@nvidia.com - -#ifndef HEAP_H -#define HEAP_H - -namespace cugraph { -namespace detail { - -namespace heap { -/* - * Our goal here is to treat a C-style array indexed - * from 0 to n-1 as a heap. The heap is a binary tress - * structure where the root of each tree is the smallest - * (or largest) value in that subtree. - * - * This is a completely serial implementation. The intention - * from a parallelism perspective would be to use this on - * a block of data assigned to a particular GPU (or CPU) thread. - * - * These functions will allow you to use an existing - * c-style array (host or device side) and manipulate - * it as a heap. - * - * Note, the heap will be represented like this - the - * shape indicates the binary tree structure, the element - * indicates the index of the array that is associated - * with the element. This diagram will help understand - * the parent/child calculations defined below. - * - * 0 - * 1 2 - * 3 4 5 6 - * 7 8 9 10 11 12 13 14 - * - * So element 0 is the root of the tree, element 1 is the - * left child of 0, element 2 is the right child of 0, etc. - */ - -namespace detail { -/** - * @brief Identify the parent index of the specified index. - * NOTE: This function does no bounds checking, so - * the parent of 0 is 0. - * - * See the above documentation for a picture to describe - * the tree. - * - * IndexT is a templated integer type of the index - * - * @param[in] index - the current array index - * @return the index of the parent of the current index - */ -template -inline IndexT __host__ __device__ parent(IndexT index) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - return ((index + 1) / 2) - 1; -} - -/** - * @brief Identify the left child index of the specified index. - * NOTE: This function does no bounds checking, so - * the left child computed might be out of bounds. - * - * See the above documentation for a picture to describe - * the tree. - * - * IndexT is a templated integer type of the index - * - * @param[in] index - the current array index - * @return the index of the left child of the current index - */ -template -inline IndexT __host__ __device__ left_child(IndexT index) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - return ((index + 1) * 2 - 1); -} - -/** - * @brief Identify the right child index of the specified index. - * NOTE: This function does no bounds checking, so - * the right child computed might be out of bounds. - * - * See the above documentation for a picture to describe - * the tree. - * - * IndexT is a templated integer type of the index - * - * @param[in] index - the current array index - * @return the index of the right child of the current index - */ -template -inline IndexT __host__ __device__ right_child(IndexT index) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - return (index + 1) * 2; -} -} // namespace detail - -/** - * @brief Reorder an existing array of elements into a heap - * - * ArrayT is a templated type of the array elements - * IndexT is a templated integer type of the index - * CompareT is a templated compare function - * - * @param[in, out] array - the existing array - * @param[in] size - the number of elements in the existing array - * @param[in] compare - the comparison function to use - * - */ -template -inline void __host__ __device__ heapify(ArrayT *array, IndexT size, CompareT compare) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - // - // We want to order ourselves as a heap. This is accomplished by starting - // at the end and for each element, compare with its parent and - // swap if necessary. We repeat this until there are no more swaps - // (should take no more than log2(size) iterations). - // - IndexT count_swaps = 1; - while (count_swaps > 0) { - count_swaps = 0; - for (IndexT i = size - 1; i > 0; --i) { - IndexT p = detail::parent(i); - - if (compare(array[i], array[p])) { - thrust::swap(array[i], array[p]); - ++count_swaps; - } - } - } -} - -/** - * @brief Pop the top element off of the heap. Note that the caller - * should decrement the size - the last element in the - * array is no longer used. - * - * ArrayT is a templated type of the array elements - * IndexT is a templated integer type of the index - * CompareT is a templated compare function - * - * @return - the top of the heap. - */ -template -inline ArrayT __host__ __device__ heap_pop(ArrayT *array, IndexT size, CompareT compare) -{ - static_assert(std::is_integral::value, "Index must be of an integral type"); - - // - // Swap the top of the array with the last element - // - --size; - thrust::swap(array[0], array[size]); - - // - // Now top element is no longer the smallest (largest), so we need - // to sift it down to the proper location. - // - for (IndexT i = 0; i < size;) { - IndexT lc = detail::left_child(i); - IndexT rc = detail::right_child(i); - IndexT smaller = i; - - // - // We can go out of bounds, let's check the simple cases - // - if (rc < size) { - // - // Both children exist in tree, pick the smaller (lerger) - // one. - // - smaller = (compare(array[lc], array[rc])) ? lc : rc; - } else if (lc < size) { - smaller = lc; - } - - if ((smaller != i) && (compare(array[smaller], array[i]))) { - thrust::swap(array[i], array[smaller]); - i = smaller; - } else { - // - // If we don't swap then we can stop checking, break out of the loop - // - i = size; - } - } - - return array[size]; -} -} // namespace heap - -} // namespace detail -} // namespace cugraph - -#endif diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ec18640bc11..524b681601f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -297,10 +297,6 @@ ConfigureTest(TRIANGLE_TEST community/triangle_test.cu) # - EGO tests -------------------------------------------------------------------------------- ConfigureTest(EGO_TEST community/egonet_test.cu) -################################################################################################### -# - RENUMBERING tests ----------------------------------------------------------------------------- -ConfigureTest(RENUMBERING_TEST renumber/renumber_test.cu) - ################################################################################################### # - FORCE ATLAS 2 tests -------------------------------------------------------------------------- ConfigureTest(FA2_TEST layout/force_atlas2_test.cu) diff --git a/cpp/tests/renumber/renumber_test.cu b/cpp/tests/renumber/renumber_test.cu deleted file mode 100644 index a7102402acf..00000000000 --- a/cpp/tests/renumber/renumber_test.cu +++ /dev/null @@ -1,579 +0,0 @@ -// -*-c++-*- - -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//#include "gmock/gmock.h" - -#include - -#include - -#include - -#include -#include - -#include - -struct RenumberingTest : public ::testing::Test { -}; - -__global__ void display_list(const char *label, uint32_t *verts, size_t length) -{ - printf("%s\n", label); - - for (size_t i = 0; i < length; ++i) { printf(" %u\n", verts[i]); } -} - -__global__ void setup_generator(curandState *state) -{ - int id = threadIdx.x + blockIdx.x * blockDim.x; - curand_init(43, id, 0, &state[id]); -} - -__global__ void generate_sources(curandState *state, int n, uint32_t *verts) -{ - int first = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - - curandState local_state = state[first]; - for (int id = first; id < n; id += stride) { verts[id] = curand(&local_state); } - - state[first] = local_state; -} - -__global__ void generate_destinations(curandState *state, - int n, - const uint32_t *sources, - uint32_t *destinations) -{ - int first = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - - curandState local_state = state[first]; - for (int id = first; id < n; id += stride) { - destinations[id] = sources[curand(&local_state) % n]; - } - - state[first] = local_state; -} - -TEST_F(RenumberingTest, SmallFixedVertexList) -{ - uint32_t src_data[] = {4U, 6U, 8U, 20U, 1U}; - uint32_t dst_data[] = {1U, 29U, 35U, 0U, 77U}; - - uint32_t src_expected[] = {2U, 3U, 4U, 5U, 1U}; - uint32_t dst_expected[] = {1U, 6U, 7U, 0U, 8U}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - uint32_t *src_d; - uint32_t *dst_d; - - uint32_t tmp_results[length]; - uint32_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint32_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, SmallFixedVertexListNegative) -{ - int64_t src_data[] = {4, 6, 8, -20, 1}; - int64_t dst_data[] = {1, 29, 35, 0, 77}; - - int64_t src_expected[] = {2, 3, 4, 8, 1}; - int64_t dst_expected[] = {1, 5, 6, 0, 7}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - int64_t *src_d; - int64_t *dst_d; - - int64_t tmp_results[length]; - int64_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(int64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(int64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ( - cudaMemcpy(tmp_map, number_map->data(), sizeof(int64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(int64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(int64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, SmallFixedVertexList64Bit) -{ - uint64_t src_data[] = {4U, 6U, 8U, 20U, 1U}; - uint64_t dst_data[] = {1U, 29U, 35U, 0U, 77U}; - - uint64_t src_expected[] = {2U, 3U, 4U, 5U, 1U}; - uint64_t dst_expected[] = {1U, 6U, 7U, 0U, 8U}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - uint64_t *src_d; - uint64_t *dst_d; - - uint64_t tmp_results[length]; - uint64_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint64_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit) -{ - uint64_t src_data[] = {4U, 6U, 8U, 20U, 1U}; - uint64_t dst_data[] = {1U, 29U, 35U, 0U, 77U}; - - uint32_t src_expected[] = {2U, 3U, 4U, 5U, 1U}; - uint32_t dst_expected[] = {1U, 6U, 7U, 0U, 8U}; - - size_t length = sizeof(src_data) / sizeof(src_data[0]); - - uint64_t *src_d; - uint64_t *dst_d; - uint32_t *src_renumbered_d; - uint32_t *dst_renumbered_d; - - uint32_t tmp_results[length]; - uint64_t tmp_map[2 * length]; - - rmm::device_vector src(length); - rmm::device_vector dst(length); - src_d = src.data().get(); - dst_d = dst.data().get(); - rmm::device_vector src_renumbered(length); - rmm::device_vector dst_renumbered(length); - src_renumbered_d = src_renumbered.data().get(); - dst_renumbered_d = dst_renumbered.data().get(); - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), - cudaSuccess); - - size_t unique_verts = 0; - - auto number_map = cugraph::detail::renumber_vertices(length, - src_d, - dst_d, - src_renumbered_d, - dst_renumbered_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ( - cudaMemcpy(tmp_results, src_renumbered_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], src_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - EXPECT_EQ( - cudaMemcpy(tmp_results, dst_renumbered_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < length; ++i) { - EXPECT_EQ(tmp_results[i], dst_expected[i]); - EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } -} - -TEST_F(RenumberingTest, Random100KVertexSet) -{ - const int num_verts = 100000; - - uint64_t *src_d; - uint64_t *dst_d; - - std::vector src_data_vec(num_verts); - std::vector dst_data_vec(num_verts); - std::vector tmp_results_vec(num_verts); - std::vector tmp_map_vec(2 * num_verts); - - uint64_t *src_data = src_data_vec.data(); - uint64_t *dst_data = dst_data_vec.data(); - uint64_t *tmp_results = tmp_results_vec.data(); - uint64_t *tmp_map = tmp_map_vec.data(); - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Generate random source and vertex values - // - srand(43); - - for (int i = 0; i < num_verts; ++i) { src_data[i] = (uint64_t)rand(); } - - for (int i = 0; i < num_verts; ++i) { dst_data[i] = (uint64_t)rand(); } - - EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * num_verts, cudaMemcpyHostToDevice), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * num_verts, cudaMemcpyHostToDevice), - cudaSuccess); - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - - auto number_map = cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(511), - thrust::less(), - rmm::mr::get_current_device_resource()); - - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - - EXPECT_EQ(cudaMemcpy( - tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint64_t) * num_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - - size_t min_id = unique_verts; - size_t max_id = 0; - - size_t cnt = 0; - for (size_t i = 0; i < num_verts; ++i) { - min_id = min(min_id, tmp_results[i]); - max_id = max(max_id, tmp_results[i]); - if (tmp_map[tmp_results[i]] != src_data[i]) ++cnt; - - if (cnt < 20) EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]); - } - - if (cnt > 0) printf(" src error count = %ld out of %d\n", cnt, num_verts); - - EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint64_t) * num_verts, cudaMemcpyDeviceToHost), - cudaSuccess); - for (size_t i = 0; i < num_verts; ++i) { - min_id = min(min_id, tmp_results[i]); - max_id = max(max_id, tmp_results[i]); - if (tmp_map[tmp_results[i]] != dst_data[i]) ++cnt; - - if (cnt < 20) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); - } - - if (cnt > 0) printf(" src error count = %ld out of %d\n", cnt, num_verts); - - EXPECT_EQ(min_id, 0); - EXPECT_EQ(max_id, (unique_verts - 1)); -} - -TEST_F(RenumberingTest, Random10MVertexSet) -{ - const int num_verts = 10000000; - - // A sampling of performance on single Quadro GV100 - // const int hash_size = 32767; // 238 ms - // const int hash_size = 8191; // 224 ms - const int hash_size = 511; // 224 ms - - uint32_t *src_d; - uint32_t *dst_d; - - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Init the random number generate - // - const int num_threads{64}; - curandState *state; - - rmm::device_vector state_vals(num_threads); - state = state_vals.data().get(); - setup_generator<<>>(state); - generate_sources<<>>(state, num_verts, src_d); - generate_destinations<<>>(state, num_verts, src_d, dst_d); - - std::cout << "done with initialization" << std::endl; - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - auto number_map = - cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - rmm::mr::get_current_device_resource()); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - std::cout << " unique verts = " << unique_verts << std::endl; - std::cout << " hash size = " << hash_size << std::endl; -} - -TEST_F(RenumberingTest, Random100MVertexSet) -{ - const int num_verts = 100000000; - - // A sampling of performance on single Quadro GV100 - // const int hash_size = 8192; // 1811 ms - // const int hash_size = 16384; // 1746 ms - // const int hash_size = 32768; // 1662 ms - // const int hash_size = 65536; // 1569 ms - // const int hash_size = 16777216; // 1328 ms - const int hash_size = 511; - - uint32_t *src_d; - uint32_t *dst_d; - - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Init the random number generate - // - const int num_threads{64}; - curandState *state; - - rmm::device_vector state_vals(num_threads); - state = state_vals.data().get(); - setup_generator<<>>(state); - generate_sources<<>>(state, num_verts, src_d); - generate_destinations<<>>(state, num_verts, src_d, dst_d); - - std::cout << "done with initialization" << std::endl; - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - auto number_map = - cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - rmm::mr::get_current_device_resource()); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - std::cout << " unique verts = " << unique_verts << std::endl; - std::cout << " hash size = " << hash_size << std::endl; -} - -TEST_F(RenumberingTest, Random500MVertexSet) -{ - const int num_verts = 500000000; - - // A sampling of performance on single Quadro GV100 - // const int hash_size = 8192; // 9918 ms - // const int hash_size = 16384; // 9550 ms - // const int hash_size = 32768; // 9146 ms - // const int hash_size = 131072; // 8537 ms - const int hash_size = 1048576; // 7335 ms - // const int hash_size = 511; // 7335 ms - - uint32_t *src_d; - uint32_t *dst_d; - - rmm::device_vector src(num_verts); - rmm::device_vector dst(num_verts); - src_d = src.data().get(); - dst_d = dst.data().get(); - - // - // Init the random number generate - // - const int num_threads{64}; - curandState *state; - - rmm::device_vector state_vals(num_threads); - state = state_vals.data().get(); - setup_generator<<>>(state); - generate_sources<<>>(state, num_verts, src_d); - generate_destinations<<>>(state, num_verts, src_d, dst_d); - - std::cout << "done with initialization" << std::endl; - - // - // Renumber everything - // - size_t unique_verts = 0; - size_t n_verts{num_verts}; - - auto start = std::chrono::system_clock::now(); - auto number_map = - cugraph::detail::renumber_vertices(n_verts, - src_d, - dst_d, - src_d, - dst_d, - &unique_verts, - cugraph::detail::HashFunctionObjectInt(hash_size), - thrust::less(), - rmm::mr::get_current_device_resource()); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - - std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl; - std::cout << " unique verts = " << unique_verts << std::endl; - std::cout << " hash size = " << hash_size << std::endl; -} - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/python/cugraph/structure/graph_primtypes.pxd b/python/cugraph/structure/graph_primtypes.pxd index e0db6c31fca..533e2d84c66 100644 --- a/python/cugraph/structure/graph_primtypes.pxd +++ b/python/cugraph/structure/graph_primtypes.pxd @@ -128,16 +128,6 @@ cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": cdef unique_ptr[GraphCOO[VT, ET, WT]] get_two_hop_neighbors[VT,ET,WT]( const GraphCSRView[VT, ET, WT] &graph) except + -cdef extern from "cugraph/functions.hpp" namespace "cugraph": - - cdef unique_ptr[device_buffer] renumber_vertices[VT_IN,VT_OUT,ET]( - ET number_of_edges, - const VT_IN *src, - const VT_IN *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size) except + - cdef extern from "" namespace "std" nogil: cdef unique_ptr[GraphCOO[int,int,float]] move(unique_ptr[GraphCOO[int,int,float]]) diff --git a/python/cugraph/structure/graph_primtypes_wrapper.pyx b/python/cugraph/structure/graph_primtypes_wrapper.pyx index 91af28380c3..95de1d70732 100644 --- a/python/cugraph/structure/graph_primtypes_wrapper.pyx +++ b/python/cugraph/structure/graph_primtypes_wrapper.pyx @@ -18,7 +18,6 @@ from cugraph.structure.graph_primtypes cimport * from cugraph.structure.graph_primtypes cimport get_two_hop_neighbors as c_get_two_hop_neighbors -from cugraph.structure.graph_primtypes cimport renumber_vertices as c_renumber_vertices from cugraph.structure.utils_wrapper import * from libcpp cimport bool import enum @@ -52,45 +51,6 @@ class Direction(enum.Enum): OUT = 2 -def renumber(source_col, dest_col): - num_edges = len(source_col) - - src_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32) - dst_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32) - - cdef uintptr_t c_src = source_col.__cuda_array_interface__['data'][0] - cdef uintptr_t c_dst = dest_col.__cuda_array_interface__['data'][0] - cdef uintptr_t c_src_renumbered = src_renumbered.__cuda_array_interface__['data'][0] - cdef uintptr_t c_dst_renumbered = dst_renumbered.__cuda_array_interface__['data'][0] - cdef int map_size = 0 - cdef int n_edges = num_edges - - cdef unique_ptr[device_buffer] numbering_map - - if (source_col.dtype == np.int32): - numbering_map = move(c_renumber_vertices[int,int,int](n_edges, - c_src, - c_dst, - c_src_renumbered, - c_dst_renumbered, - &map_size)) - else: - numbering_map = move(c_renumber_vertices[long,int,int](n_edges, - c_src, - c_dst, - c_src_renumbered, - c_dst_renumbered, - &map_size)) - - - map = DeviceBuffer.c_from_unique_ptr(move(numbering_map)) - map = Buffer(map) - - output_map = cudf.Series(data=map, dtype=source_col.dtype) - - return src_renumbered, dst_renumbered, output_map - - def view_adj_list(input_graph): if input_graph.adjlist is None: