From 3c2639500d142f08e391f2583800d5e22c514420 Mon Sep 17 00:00:00 2001
From: Chuck Hastings <charlesh@nvidia.com>
Date: Fri, 18 Jun 2021 12:22:01 -0400
Subject: [PATCH] remove legacy renumbering code

---
 cpp/CMakeLists.txt                            |   1 -
 cpp/include/cugraph/functions.hpp             |  34 -
 cpp/src/converters/renumber.cu                |  68 --
 cpp/src/converters/renumber.cuh               | 321 ----------
 cpp/src/sort/bitonic.cuh                      | 546 -----------------
 cpp/src/utilities/heap.cuh                    | 222 -------
 cpp/tests/CMakeLists.txt                      |   4 -
 cpp/tests/renumber/renumber_test.cu           | 579 ------------------
 python/cugraph/structure/graph_primtypes.pxd  |  10 -
 .../structure/graph_primtypes_wrapper.pyx     |  40 --
 10 files changed, 1825 deletions(-)
 delete mode 100644 cpp/src/converters/renumber.cu
 delete mode 100644 cpp/src/converters/renumber.cuh
 delete mode 100644 cpp/src/sort/bitonic.cuh
 delete mode 100644 cpp/src/utilities/heap.cuh
 delete mode 100644 cpp/tests/renumber/renumber_test.cu
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ae0b524689f..7da32f451b8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -169,7 +169,6 @@ add_library(cugraph SHARED
     src/link_prediction/jaccard.cu
     src/link_prediction/overlap.cu
     src/layout/force_atlas2.cu
-    src/converters/renumber.cu
     src/converters/COOtoCSR.cu
     src/community/spectral_clustering.cu
     src/community/louvain.cu
diff --git a/cpp/include/cugraph/functions.hpp b/cpp/include/cugraph/functions.hpp
index 00e8648b156..930eeb52b5c 100644
--- a/cpp/include/cugraph/functions.hpp
+++ b/cpp/include/cugraph/functions.hpp
@@ -44,40 +44,6 @@ std::unique_ptr<GraphCSR<VT, ET, WT>> coo_to_csr(
   GraphCOOView<VT, ET, WT> const &graph,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief    Renumber source and destination indices
- *
- * Renumber source and destination indexes to be a dense numbering,
- * using contiguous values between 0 and number of vertices minus 1.
- *
- * @throws                    cugraph::logic_error when an error occurs.
- *
- * @tparam VT_IN              type of vertex index input
- * @tparam VT_OUT             type of vertex index output
- * @tparam ET                 type of edge index
- *
- * @param[in]  number_of_edges number of edges in the graph
- * @param[in]  src            Pointer to device memory containing source vertex ids
- * @param[in]  dst            Pointer to device memory containing destination vertex ids
- * @param[out] src_renumbered Pointer to device memory containing the output source vertices.
- * @param[out] dst_renumbered Pointer to device memory containing the output destination vertices.
- * @param[out] map_size       Pointer to local memory containing the number of elements in the
- * renumbering map
- * @param[in]  mr             Memory resource used to allocate the returned graph
- *
- * @return                    Unique pointer to renumbering map
- *
- */
-template <typename VT_IN, typename VT_OUT, typename ET>
-std::unique_ptr<rmm::device_buffer> renumber_vertices(
-  ET number_of_edges,
-  VT_IN const *src,
-  VT_IN const *dst,
-  VT_OUT *src_renumbered,
-  VT_OUT *dst_renumbered,
-  ET *map_size,
-  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief    Broadcast using handle communicator
  *
diff --git a/cpp/src/converters/renumber.cu b/cpp/src/converters/renumber.cu
deleted file mode 100644
index 9aedbc70e8b..00000000000
--- a/cpp/src/converters/renumber.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "renumber.cuh"
-
-namespace cugraph {
-
-template <typename VT_IN, typename VT_OUT, typename ET>
-std::unique_ptr<rmm::device_buffer> renumber_vertices(
-  ET number_of_edges,
-  VT_IN const *src,
-  VT_IN const *dst,
-  VT_OUT *src_renumbered,
-  VT_OUT *dst_renumbered,
-  ET *map_size,
-  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
-
-{
-  //
-  //  For now, let's just specify a default value of the hash size.
-  //  This should be configurable.
-  //
-  //  FIXME:  cudf has a hash table implementation (moving to cuCollections)
-  //          that is dynamic.  We should use it instead, it will be faster
-  //          and dynamically adjust to data sizes.
-  //
-  int hash_size = 8191;
-
-  return cugraph::detail::renumber_vertices(number_of_edges,
-                                            src,
-                                            dst,
-                                            src_renumbered,
-                                            dst_renumbered,
-                                            map_size,
-                                            cugraph::detail::HashFunctionObjectInt(hash_size),
-                                            thrust::less<int32_t>(),
-                                            mr);
-}
-
-template std::unique_ptr<rmm::device_buffer> renumber_vertices(int32_t,
-                                                               int64_t const *,
-                                                               int64_t const *,
-                                                               int32_t *,
-                                                               int32_t *,
-                                                               int32_t *,
-                                                               rmm::mr::device_memory_resource *);
-template std::unique_ptr<rmm::device_buffer> renumber_vertices(int32_t,
-                                                               int32_t const *,
-                                                               int32_t const *,
-                                                               int32_t *,
-                                                               int32_t *,
-                                                               int32_t *,
-                                                               rmm::mr::device_memory_resource *);
-
-}  // namespace cugraph
diff --git a/cpp/src/converters/renumber.cuh b/cpp/src/converters/renumber.cuh
deleted file mode 100644
index ccf4e6f62c2..00000000000
--- a/cpp/src/converters/renumber.cuh
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#define CUB_STDERR
-
-#include <chrono>
-
-#include <cub/cub.cuh>
-
-#include <cuda_runtime_api.h>
-#include <thrust/binary_search.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/scan.h>
-
-#include <raft/cudart_utils.h>
-#include <rmm/device_buffer.hpp>
-
-#include <cugraph/utilities/error.hpp>
-#include <utilities/graph_utils.cuh>
-#include "sort/bitonic.cuh"
-
-namespace cugraph {
-namespace detail {
-
-namespace renumber {
-typedef uint32_t hash_type;
-typedef uint32_t index_type;
-}  // namespace renumber
-
-class HashFunctionObjectInt {
- public:
-  HashFunctionObjectInt(renumber::hash_type hash_size) : hash_size_(hash_size) {}
-
-  template <typename VertexIdType>
-  __device__ __inline__ renumber::hash_type operator()(const VertexIdType &vertex_id) const
-  {
-    return ((vertex_id % hash_size_) + hash_size_) % hash_size_;
-  }
-
-  renumber::hash_type getHashSize() const { return hash_size_; }
-
- private:
-  renumber::hash_type hash_size_;
-};
-
-/**
- * @brief Renumber vertices to a dense numbering (0..vertex_size-1)
- *
- *    This is a templated function so it can take 32 or 64 bit integers.  The
- *    intention is to take source and destination vertex ids that might be
- *    sparsely scattered across the range and push things down to a dense
- *    numbering.
- *
- *    Arrays src, dst, src_renumbered, dst_renumbered and numbering_map are
- *    assumed to be pre-allocated.  numbering_map is best safely allocated
- *    to store 2 * size vertices.
- *
- * @param[in]  size                 Number of edges
- * @param[in]  src                  List of source vertices
- * @param[in]  dst                  List of dest vertices
- * @param[out] src_renumbered       List of source vertices, renumbered
- * @param[out] dst_renumbered       List of dest vertices, renumbered
- * @param[out] vertex_size          Number of unique vertices
- * @param[out] numbering_map        Map of new vertex id to original vertex id. numbering_map[newId]
- * = oldId
- *
- */
-template <typename T_size, typename T_in, typename T_out, typename Hash_t, typename Compare_t>
-std::unique_ptr<rmm::device_buffer> renumber_vertices(T_size size,
-                                                      const T_in *src,
-                                                      const T_in *dst,
-                                                      T_out *src_renumbered,
-                                                      T_out *dst_renumbered,
-                                                      T_size *map_size,
-                                                      Hash_t hash,
-                                                      Compare_t compare,
-                                                      rmm::mr::device_memory_resource *mr)
-{
-  //
-  // This function will allocate numbering_map to be the exact size needed
-  // (user doesn't know a priori how many unique vertices there are.
-  //
-  // Here's the idea: Create a hash table. Since we're dealing with integers,
-  // we can take the integer modulo some prime p to create hash buckets.  Then
-  // we dedupe the hash buckets to create a deduped set of entries.  This hash
-  // table can then be used to renumber everything.
-  //
-  // We need 2 arrays for hash indexes, and one array for data
-  //
-  cudaStream_t stream = nullptr;
-
-  renumber::hash_type hash_size = hash.getHashSize();
-
-  rmm::device_vector<T_in> hash_data_v(2 * size);
-  rmm::device_vector<renumber::index_type> hash_bins_start_v(1 + hash_size,
-                                                             renumber::index_type{0});
-  rmm::device_vector<renumber::index_type> hash_bins_end_v(1 + hash_size);
-
-  T_in *hash_data                       = hash_data_v.data().get();
-  renumber::index_type *hash_bins_start = hash_bins_start_v.data().get();
-  renumber::index_type *hash_bins_end   = hash_bins_end_v.data().get();
-
-  //
-  //  Pass 1: count how many vertex ids end up in each hash bin
-  //
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   src,
-                   src + size,
-                   [hash_bins_start, hash] __device__(T_in vid) {
-                     atomicAdd(hash_bins_start + hash(vid), renumber::index_type{1});
-                   });
-
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   dst,
-                   dst + size,
-                   [hash_bins_start, hash] __device__(T_in vid) {
-                     atomicAdd(hash_bins_start + hash(vid), renumber::index_type{1});
-                   });
-
-  //
-  //  Compute exclusive sum and copy it into both hash_bins_start and
-  //  hash_bins_end.  hash_bins_end will be used to populate the
-  //  hash_data array and at the end will identify the end of
-  //  each range.
-  //
-  thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream),
-                         hash_bins_start,
-                         hash_bins_start + hash_size + 1,
-                         hash_bins_end);
-
-  CUDA_TRY(cudaMemcpy(hash_bins_start,
-                      hash_bins_end,
-                      (hash_size + 1) * sizeof(renumber::hash_type),
-                      cudaMemcpyDeviceToDevice));
-
-  //
-  //  Pass 2: Populate hash_data with data from the hash bins.
-  //
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   src,
-                   src + size,
-                   [hash_bins_end, hash_data, hash] __device__(T_in vid) {
-                     uint32_t hash_index              = hash(vid);
-                     renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1);
-                     hash_data[hash_offset]           = vid;
-                   });
-
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   dst,
-                   dst + size,
-                   [hash_bins_end, hash_data, hash] __device__(T_in vid) {
-                     uint32_t hash_index              = hash(vid);
-                     renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1);
-                     hash_data[hash_offset]           = vid;
-                   });
-
-  //
-  //  Now that we have data in hash bins, we'll do a segmented sort of the has bins
-  //  to sort each bin.  This will allow us to identify duplicates (all duplicates
-  //  are in the same hash bin so they will end up sorted consecutively).
-  //
-  renumber::index_type size_as_int = size;
-  cugraph::sort::bitonic::segmented_sort(
-    hash_size, size_as_int, hash_bins_start, hash_bins_end, hash_data, compare, stream);
-
-  //
-  //  Now we rinse and repeat.  hash_data contains the data organized into sorted
-  //  hash bins.  This allows us to identify duplicates.  We'll start over but
-  //  we'll skip the duplicates when we repopulate the hash table.
-  //
-
-  //
-  //  Pass 3: count how many vertex ids end up in each hash bin after deduping
-  //
-  CUDA_TRY(cudaMemset(hash_bins_start, 0, (1 + hash_size) * sizeof(renumber::index_type)));
-
-  thrust::for_each(
-    rmm::exec_policy(stream)->on(stream),
-    thrust::make_counting_iterator<renumber::index_type>(0),
-    thrust::make_counting_iterator<renumber::index_type>(2 * size),
-    [hash_data, hash_bins_start, hash, compare, size] __device__(renumber::index_type idx) {
-      //
-      //     Two items (a and b) are equal if
-      //   compare(a,b) is false and compare(b,a)
-      //   is also false.  If either is true then
-      //   a and b are not equal.
-      //
-      //     Note that if there are k duplicate
-      //   instances of an entry, only the LAST
-      //   entry will be counted
-      //
-      bool unique = ((idx + 1) == (2 * size)) || compare(hash_data[idx], hash_data[idx + 1]) ||
-                    compare(hash_data[idx + 1], hash_data[idx]);
-
-      if (unique) atomicAdd(hash_bins_start + hash(hash_data[idx]), renumber::index_type{1});
-    });
-
-  //
-  //  Compute exclusive sum and copy it into both hash_bins_start and
-  //  hash bins end.
-  //
-  thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream),
-                         hash_bins_start,
-                         hash_bins_start + hash_size + 1,
-                         hash_bins_end);
-
-  CUDA_TRY(cudaMemcpy(hash_bins_start,
-                      hash_bins_end,
-                      (hash_size + 1) * sizeof(renumber::hash_type),
-                      cudaMemcpyDeviceToDevice));
-
-  //
-  //    The last entry in the array (hash_bins_end[hash_size]) is the
-  //  total number of unique vertices
-  //
-  renumber::index_type temp = 0;
-  CUDA_TRY(cudaMemcpy(
-    &temp, hash_bins_end + hash_size, sizeof(renumber::index_type), cudaMemcpyDeviceToHost));
-  *map_size = temp;
-
-  rmm::device_buffer numbering_map(temp * sizeof(T_in), stream, mr);
-  T_in *local_numbering_map = static_cast<T_in *>(numbering_map.data());
-
-  //
-  //  Pass 4: Populate hash_data with data from the hash bins after deduping
-  //
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   thrust::make_counting_iterator<renumber::index_type>(0),
-                   thrust::make_counting_iterator<renumber::index_type>(2 * size),
-                   [hash_bins_end, hash_data, local_numbering_map, hash, compare, size] __device__(
-                     renumber::index_type idx) {
-                     bool unique = ((idx + 1) == (2 * size)) ||
-                                   compare(hash_data[idx], hash_data[idx + 1]) ||
-                                   compare(hash_data[idx + 1], hash_data[idx]);
-
-                     if (unique) {
-                       uint32_t hash_index              = hash(hash_data[idx]);
-                       renumber::index_type hash_offset = atomicAdd(&hash_bins_end[hash_index], 1);
-                       local_numbering_map[hash_offset] = hash_data[idx];
-                     }
-                   });
-
-  //
-  //  At this point, hash_bins_start and numbering_map partition the
-  //  unique data into a hash table.
-  //
-
-  //
-  //  If we do a segmented sort now, we can do the final lookups.
-  //
-  size_as_int = size;
-  cugraph::sort::bitonic::segmented_sort(
-    hash_size, size_as_int, hash_bins_start, hash_bins_end, local_numbering_map, compare, stream);
-
-  //
-  //     Renumber the input.  For each vertex, identify the
-  //   hash bin, and then search the hash bin for the
-  //   record that matches, the relative offset between that
-  //   element and the beginning of the array is the vertex
-  //   id in the renumbered map.
-  //
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   thrust::make_counting_iterator<renumber::index_type>(0),
-                   thrust::make_counting_iterator<renumber::index_type>(size),
-                   [local_numbering_map,
-                    hash_bins_start,
-                    hash_bins_end,
-                    hash,
-                    src,
-                    src_renumbered,
-                    compare] __device__(renumber::index_type idx) {
-                     renumber::hash_type tmp = hash(src[idx]);
-                     const T_in *id =
-                       thrust::lower_bound(thrust::seq,
-                                           local_numbering_map + hash_bins_start[tmp],
-                                           local_numbering_map + hash_bins_end[tmp],
-                                           src[idx],
-                                           compare);
-                     src_renumbered[idx] = id - local_numbering_map;
-                   });
-
-  thrust::for_each(rmm::exec_policy(stream)->on(stream),
-                   thrust::make_counting_iterator<renumber::index_type>(0),
-                   thrust::make_counting_iterator<renumber::index_type>(size),
-                   [local_numbering_map,
-                    hash_bins_start,
-                    hash_bins_end,
-                    hash,
-                    dst,
-                    dst_renumbered,
-                    compare] __device__(renumber::index_type idx) {
-                     renumber::hash_type tmp = hash(dst[idx]);
-                     const T_in *id =
-                       thrust::lower_bound(thrust::seq,
-                                           local_numbering_map + hash_bins_start[tmp],
-                                           local_numbering_map + hash_bins_end[tmp],
-                                           dst[idx],
-                                           compare);
-                     dst_renumbered[idx] = id - local_numbering_map;
-                   });
-
-  return std::make_unique<rmm::device_buffer>(std::move(numbering_map));
-}
-
-}  // namespace detail
-}  // namespace cugraph
diff --git a/cpp/src/sort/bitonic.cuh b/cpp/src/sort/bitonic.cuh
deleted file mode 100644
index b1b19bafdf0..00000000000
--- a/cpp/src/sort/bitonic.cuh
+++ /dev/null
@@ -1,546 +0,0 @@
-// -*-c++-*-
-
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Bitonic sort implementation
-// Author: Chuck Hastings charlesh@nvidia.com
-
-// TODO:  Read a paper (Hagen Peters 2011) that suggests some
-//        ways to optimize this.  Need to shift into a kernel
-//        and then organize to support multiple passes in
-//        a single kernel call.  This should reduce kernel
-//        launch overhead and the number of memory references,
-//        which should drive down the overall time.
-//
-
-#ifndef BITONIC_SORT_H
-#define BITONIC_SORT_H
-
-#include <thrust/for_each.h>
-#include <thrust/scan.h>
-
-#include <rmm/thrust_rmm_allocator.h>
-#include <cugraph/utilities/error.hpp>
-
-namespace cugraph {
-namespace sort {
-
-namespace bitonic {
-/*
- *  This implementation is based upon the bitonic sort technique.
- *  This should be pretty efficient in a SIMT environment.
- */
-namespace detail {
-/**
- * @brief Compare two items, if the compare functor returns true
- *        then swap them.
- *
- * @param a - reference to the first item
- * @param b - reference to the second item
- * @param compare - reference to a comparison functor
- */
-template <typename ValueT, typename CompareT>
-inline void __device__ compareAndSwap(ValueT &a, ValueT &b, CompareT &compare)
-{
-  if (!compare(a, b)) { thrust::swap(a, b); }
-}
-
-/*
- * @brief perform repartitioning of two sorted partitions.  This
- *        is analagous to the bitonic merge step.  But it only
- *        performs the compare and swap portion of the bitonic
- *        merge.  The subsequent sorts are handled externally.
- *
- *        The repartition assumes that the data is segregated
- *        into partitions of binSize.  So if there are 8 elements
- *        and a bin size of 2 then the array will be partitioned
- *        into 4 bins of size 2.  Each bin is assumed to be
- *        sorted.  The repartition takes consecutive bins and
- *        repartitions them so that the first bin contains the
- *        low elements and the second bin contains the high elements.
- *
- * @param array - the array containing the data we need to repartition
- * @param count - the number of elements in the array
- * @param binSize - the size of the bin
- * @param compare - comparison functor
- */
-template <typename ValueT, typename CompareT>
-void repartition(ValueT *array, int count, int binSize, CompareT &compare)
-{
-  thrust::for_each(thrust::make_counting_iterator<int>(0),
-                   thrust::make_counting_iterator<int>(count / 2),
-
-                   [array, count, binSize, compare] __device__(int idx) {
-                     //
-                     // Identify which elements in which partition
-                     // we are responsible for comparing and swapping
-                     //
-                     // We're running count/2 iterations.  Each iteration
-                     // needs to operate on a pair of elements.  Consider
-                     // the pairs of partitions, this will let us determine
-                     // which elements we compare.
-                     //
-                     int bi_partition = idx / binSize;
-
-                     //
-                     // bi_partition identifies which pair of partitions
-                     // we're operating on.  Out of each bin we're only
-                     // going to do binSize comparisons, so the first
-                     // element in the comparison will be based on
-                     // idx % binSize.
-                     //
-                     int offset = idx % binSize;
-
-                     //
-                     // First element is easy.
-                     // Second element is "easy" but we'll fix
-                     //   special cases below.
-                     //
-                     int i = bi_partition * (binSize * 2) + offset;
-                     int j = (bi_partition + 1) * (binSize * 2) - 1 - offset;
-
-                     //
-                     // The last partition pair is the problem.
-                     // There are several cases:
-                     //    1) Both partitions are full.  This
-                     //       is the easy case, we can just
-                     //       compare and swap elements
-                     //    2) First partition is full, the second
-                     //       partition is not full (possibly
-                     //       empty).  In this case, we only
-                     //       compare some of the elements.
-                     //    3) First partition is not full, there
-                     //       is no second partition.  In this
-                     //       case we actually don't have any
-                     //       work to do.
-                     //
-                     // This should be a simple check.  If the
-                     // second element is beyond the end of
-                     // the array then there is nothing to compare
-                     // and swap.  Note that if the first
-                     // element is beyond the end of the array
-                     // there is also nothing to compare and swap,
-                     // but if the first element is beyond the
-                     // end of the array then the second element
-                     // will also be beyond the end of the array.
-                     //
-                     if (j < count) compareAndSwap(array[i], array[j], compare);
-                   });
-}
-
-/*
- * @brief perform shuffles.  After the repartition we need
- *        to perform shuffles of the halves to get things in
- *        order.
- *
- * @param array - the array containing the data we need to repartition
- * @param count - the number of elements in the array
- * @param binSize - the size of the bin
- * @param compare - comparison functor
- */
-template <typename ValueT, typename CompareT>
-void shuffles(ValueT *array, int count, int binSize, CompareT &compare)
-{
-  thrust::for_each(thrust::make_counting_iterator<int>(0),
-                   thrust::make_counting_iterator<int>((count + 1) / 2),
-                   [array, count, binSize, compare] __device__(int idx) {
-                     //
-                     // Identify which elements in which partition
-                     // we are responsible for comparing and swapping
-                     //
-                     // We're running count/2 iterations.  Each iteration
-                     // needs to operate on a pair of elements.  Consider
-                     // the pairs of partitions, this will let us determine
-                     // which elements we compare.
-                     //
-                     int bi_partition = idx / binSize;
-
-                     //
-                     // bi_partition identifies which pair of partitions
-                     // we're operating on.  Out of each bin we're only
-                     // going to do binSize comparisons, so the first
-                     // element in the comparison will be based on
-                     // idx % binSize.
-                     //
-                     int offset = idx % binSize;
-
-                     //
-                     // First element is easy.
-                     // Second element is "easy" i + binSize.
-                     //
-                     int i = bi_partition * (binSize * 2) + offset;
-                     int j = i + binSize;
-
-                     //
-                     // If the second element is beyond the end of
-                     // the array then there is nothing to compare
-                     // and swap.
-                     //
-                     if (j < count) compareAndSwap(array[i], array[j], compare);
-                   });
-}
-
-/*
- * @brief perform repartitioning of two sorted partitions in the
- *        segmented sort case.
- *
- *        The repartition assumes that the data is segregated
- *        into partitions of binSize.  So if there are 8 elements
- *        and a bin size of 2 then the array will be partitioned
- *        into 4 bins of size 2.  Each bin is assumed to be
- *        sorted.  The repartition takes consecutive bins and
- *        repartitions them so that the first bin contains the
- *        low elements and the second bin contains the high elements.
- *
- * @param array - the array containing the data we need to repartition
- * @param count - the number of elements in the array
- * @param binSize - the size of the bin
- * @param compare - comparison functor
- */
-template <typename IndexT, typename ValueT, typename CompareT>
-void repartition_segmented(const IndexT *d_begin_offsets,
-                           const IndexT *d_end_offsets,
-                           ValueT *d_items,
-                           IndexT start,
-                           IndexT stop,
-                           IndexT *d_grouped_bins,
-                           int binSize,
-                           int max_count,
-                           int bin_pairs,
-                           CompareT &compare)
-{
-  thrust::for_each(thrust::device,
-                   thrust::make_counting_iterator<int>(0),
-                   thrust::make_counting_iterator<int>(max_count / 2),
-                   [d_begin_offsets,
-                    d_end_offsets,
-                    d_items,
-                    start,
-                    stop,
-                    d_grouped_bins,
-                    bin_pairs,
-                    binSize,
-                    compare] __device__(int idx) {
-                     //
-                     //  idx needs to be mapped into the correct place
-                     //
-                     int entry     = idx / bin_pairs;
-                     int entry_idx = idx % bin_pairs;
-                     int base      = d_begin_offsets[d_grouped_bins[start + entry]];
-                     int count     = d_end_offsets[d_grouped_bins[start + entry]] - base;
-
-                     //
-                     // Identify which elements in which partition
-                     // we are responsible for comparing and swapping
-                     //
-                     // We're running count/2 iterations.  Each iteration
-                     // needs to operate on a pair of elements.  Consider
-                     // the pairs of partitions, this will let us determine
-                     // which elements we compare.
-                     //
-                     int bi_partition = entry_idx / binSize;
-
-                     //
-                     // bi_partition identifies which pair of partitions
-                     // we're operating on.  Out of each bin we're only
-                     // going to do binSize comparisons, so the first
-                     // element in the comparison will be based on
-                     // idx % binSize.
-                     //
-                     int offset = entry_idx % binSize;
-
-                     //
-                     // First element is easy.
-                     // Second element is "easy" but we'll fix
-                     //   special cases below.
-                     //
-                     int i = bi_partition * (binSize * 2) + offset;
-                     int j = (bi_partition + 1) * (binSize * 2) - 1 - offset;
-
-                     //
-                     // The last partition pair is the problem.
-                     // There are several cases:
-                     //    1) Both partitions are full.  This
-                     //       is the easy case, we can just
-                     //       compare and swap elements
-                     //    2) First partition is full, the second
-                     //       partition is not full (possibly
-                     //       empty).  In this case, we only
-                     //       compare some of the elements.
-                     //    3) First partition is not full, there
-                     //       is no second partition.  In this
-                     //       case we actually don't have any
-                     //       work to do.
-                     //
-                     // This should be a simple check.  If the
-                     // second element is beyond the end of
-                     // the array then there is nothing to compare
-                     // and swap.  Note that if the first
-                     // element is beyond the end of the array
-                     // there is also nothing to compare and swap,
-                     // but if the first element is beyond the
-                     // end of the array then the second element
-                     // will also be beyond the end of the array.
-                     //
-                     if (j < count) {
-                       compareAndSwap(d_items[base + i], d_items[base + j], compare);
-                     }
-                   });
-}
-
-/*
- * @brief perform shuffles.  After the repartition we need
- *        to perform shuffles of the halves to get things in
- *        order.
- *
- * @param rowOffsets - the row offsets identifying the segments
- * @param colIndices - the values to sort within the segments
- * @param start - position within the grouped bins where we
- *                start this pass
- * @param stop - position within the grouped bins where we stop
- *               this pass
- * @param d_grouped_bins - lrb grouped bins.  All bins between
- *                         start and stop are in the same lrb bin
- * @param binSize - the bitonic bin size for this pass of the shuffles
- * @param max_count - maximum number of elements possible for
- *                    this call
- * @param bin_pairs - the number of bin pairs
- * @param compare - the comparison functor
- */
-template <typename IndexT, typename ValueT, typename CompareT>
-void shuffles_segmented(const IndexT *d_begin_offsets,
-                        const IndexT *d_end_offsets,
-                        ValueT *d_items,
-                        IndexT start,
-                        IndexT stop,
-                        IndexT *d_grouped_bins,
-                        int binSize,
-                        long max_count,
-                        int bin_pairs,
-                        CompareT &compare)
-{
-  thrust::for_each(thrust::make_counting_iterator<int>(0),
-                   thrust::make_counting_iterator<int>(max_count / 2),
-                   [d_begin_offsets,
-                    d_end_offsets,
-                    d_items,
-                    start,
-                    stop,
-                    d_grouped_bins,
-                    compare,
-                    max_count,
-                    bin_pairs,
-                    binSize] __device__(int idx) {
-                     //
-                     //  idx needs to be mapped into the correct place
-                     //
-                     int entry     = idx / bin_pairs;
-                     int entry_idx = idx % bin_pairs;
-                     int base      = d_begin_offsets[d_grouped_bins[start + entry]];
-                     int count     = d_end_offsets[d_grouped_bins[start + entry]] - base;
-
-                     //
-                     // Identify which elements in which partition
-                     // we are responsible for comparing and swapping
-                     //
-                     // We're running count/2 iterations.  Each iteration
-                     // needs to operate on a pair of elements.  Consider
-                     // the pairs of partitions, this will let us determine
-                     // which elements we compare.
-                     //
-                     int bi_partition = entry_idx / binSize;
-
-                     //
-                     // bi_partition identifies which pair of partitions
-                     // we're operating on.  Out of each bin we're only
-                     // going to do binSize comparisons, so the first
-                     // element in the comparison will be based on
-                     // idx % binSize.
-                     //
-                     int offset = entry_idx % binSize;
-
-                     //
-                     // First element is easy.
-                     // Second element is "easy" i + binSize.
-                     //
-                     int i = bi_partition * (binSize * 2) + offset;
-                     int j = i + binSize;
-
-                     //
-                     // If the second element is beyond the end of
-                     // the array then there is nothing to compare
-                     // and swap.
-                     //
-                     if (j < count) compareAndSwap(d_items[base + i], d_items[base + j], compare);
-                   });
-}
-}  // namespace detail
-
-template <typename ValueT, typename CompareT>
-void sort(ValueT *array, int count, CompareT &compare)
-{
-  for (int i = 1; i < count; i *= 2) {
-    detail::repartition(array, count, i, compare);
-
-    for (int j = i / 2; j > 0; j /= 2) { detail::shuffles(array, count, j, compare); }
-  }
-}
-
-/**
- * @brief Perform a segmented sort.  This function performs a sort
- *        on each segment of the specified input.  This sort is done
- *        in place, so the d_items array is modified during this call.
- *        Sort is done according to the (optionally) specified
- *        comparison function.
- *
- *        Note that this function uses O(num_segments) temporary
- *        memory during execution.
- *
- * @param [in] num_segments - the number of segments that the items array is divided into
- * @param [in] num_items - the number of items in the array
- * @param [in] d_begin_offsets - device array containing the offset denoting the start
- *                               of each segment
- * @param [in] d_end_offsets - device array containing the offset denoting the end
- *                               of each segment.
- * @param [in/out] d_items - device array containing the items to sort
- * @param [in] compare - [optional] comparison function.  Default is thrust::less<ValueT>.
- * @param [in] stream - [optional] CUDA stream to launch kernels with.  Default is stream 0.
- *
- * @return error code
- */
-template <typename IndexT, typename ValueT, typename CompareT>
-void segmented_sort(IndexT num_segments,
-                    IndexT num_items,
-                    const IndexT *d_begin_offsets,
-                    const IndexT *d_end_offsets,
-                    ValueT *d_items,
-                    CompareT compare    = thrust::less<ValueT>(),
-                    cudaStream_t stream = nullptr)
-{
-  //
-  //  NOTE: This should probably be computed somehow.  At the moment
-  //        we are limited to 32 bits because of memory sizes.
-  //
-  int lrb_size = 32;
-  IndexT lrb[lrb_size + 1];
-
-  rmm::device_vector<IndexT> lrb_v(lrb_size + 1);
-  rmm::device_vector<IndexT> grouped_bins_v(num_segments + 1);
-
-  IndexT *d_lrb          = lrb_v.data().get();
-  IndexT *d_grouped_bins = grouped_bins_v.data().get();
-
-  CUDA_TRY(cudaMemset(d_lrb, 0, (lrb_size + 1) * sizeof(IndexT)));
-
-  //
-  //  First we'll count how many entries go in each bin
-  //
-  thrust::for_each(thrust::make_counting_iterator<int>(0),
-                   thrust::make_counting_iterator<int>(num_segments),
-                   [d_begin_offsets, d_end_offsets, d_lrb] __device__(int idx) {
-                     int size = d_end_offsets[idx] - d_begin_offsets[idx];
-                     //
-                     // NOTE: If size is 0 or 1 then no
-                     //       sorting is required, so we'll
-                     //       eliminate those bins here
-                     //
-                     if (size > 1) atomicAdd(d_lrb + __clz(size), 1);
-                   });
-
-  //
-  //  Exclusive sum will identify where each bin begins
-  //
-  thrust::exclusive_scan(
-    rmm::exec_policy(stream)->on(stream), d_lrb, d_lrb + (lrb_size + 1), d_lrb);
-
-  //
-  //  Copy the start of each bin to local memory
-  //
-  CUDA_TRY(cudaMemcpy(lrb, d_lrb, (lrb_size + 1) * sizeof(IndexT), cudaMemcpyDeviceToHost));
-
-  //
-  //  Now we'll populate grouped_bins.  This will corrupt
-  //  d_lrb, but we've already copied it locally.
-  //
-  thrust::for_each(thrust::make_counting_iterator<int>(0),
-                   thrust::make_counting_iterator<int>(num_segments),
-                   [d_begin_offsets, d_end_offsets, d_lrb, d_grouped_bins] __device__(int idx) {
-                     int size = d_end_offsets[idx] - d_begin_offsets[idx];
-                     if (size > 1) {
-                       int pos             = atomicAdd(d_lrb + __clz(size), 1);
-                       d_grouped_bins[pos] = idx;
-                     }
-                   });
-
-  //
-  //  At this point, d_grouped_bins contains the index of the
-  //  different segments, ordered into log2 bins.
-  //
-
-  //
-  //  Now we're ready to go.
-  //
-  //  For simplicity (at least for now), let's just
-  //  iterate over each lrb bin.  Note that the larger
-  //  the index i, the smaller the size of each bin... but
-  //  there will likely be many more inhabitants of that bin.
-  //
-  for (int i = 0; i < lrb_size; ++i) {
-    int size = lrb[i + 1] - lrb[i];
-    if (size > 0) {
-      //
-      //  There are inhabitants of this lrb range
-      //
-      //  max_count will be used to drive the bitonic
-      //  passes (1, 2, 4, 8, ... up to max_count)
-      //
-      int max_count = 1 << (lrb_size - i);
-
-      for (int j = 1; j < max_count; j *= 2) {
-        detail::repartition_segmented(d_begin_offsets,
-                                      d_end_offsets,
-                                      d_items,
-                                      lrb[i],
-                                      lrb[i + 1],
-                                      d_grouped_bins,
-                                      j,
-                                      size * max_count,
-                                      max_count / 2,
-                                      compare);
-
-        for (int k = j / 2; k > 0; k /= 2) {
-          detail::shuffles_segmented(d_begin_offsets,
-                                     d_end_offsets,
-                                     d_items,
-                                     lrb[i],
-                                     lrb[i + 1],
-                                     d_grouped_bins,
-                                     k,
-                                     size * max_count,
-                                     max_count / 2,
-                                     compare);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace bitonic
-}  // namespace sort
-}  // namespace cugraph
-
-#endif
diff --git a/cpp/src/utilities/heap.cuh b/cpp/src/utilities/heap.cuh
deleted file mode 100644
index 0747a658324..00000000000
--- a/cpp/src/utilities/heap.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-// -*-c++-*-
-
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Utilities to treat array as a heap
-// Author: Chuck Hastings charlesh@nvidia.com
-
-#ifndef HEAP_H
-#define HEAP_H
-
-namespace cugraph {
-namespace detail {
-
-namespace heap {
-/*
- *  Our goal here is to treat a C-style array indexed
- *  from 0 to n-1 as a heap.  The heap is a binary tress
- *  structure where the root of each tree is the smallest
- *  (or largest) value in that subtree.
- *
- *  This is a completely serial implementation.  The intention
- *  from a parallelism perspective would be to use this on
- *  a block of data assigned to a particular GPU (or CPU) thread.
- *
- *  These functions will allow you to use an existing
- *  c-style array (host or device side) and manipulate
- *  it as a heap.
- *
- *  Note, the heap will be represented like this - the
- *  shape indicates the binary tree structure, the element
- *  indicates the index of the array that is associated
- *  with the element.  This diagram will help understand
- *  the parent/child calculations defined below.
- *
- *                    0
- *              1           2
- *           3     4     5     6
- *          7  8  9 10 11 12 13 14
- *
- *   So element 0 is the root of the tree, element 1 is the
- *   left child of 0, element 2 is the right child of 0, etc.
- */
-
-namespace detail {
-/**
- * @brief Identify the parent index of the specified index.
- *        NOTE: This function does no bounds checking, so
- *        the parent of 0 is 0.
- *
- *   See the above documentation for a picture to describe
- *   the tree.
- *
- *   IndexT is a templated integer type of the index
- *
- * @param[in]  index - the current array index
- * @return     the index of the parent of the current index
- */
-template <typename IndexT>
-inline IndexT __host__ __device__ parent(IndexT index)
-{
-  static_assert(std::is_integral<IndexT>::value, "Index must be of an integral type");
-
-  return ((index + 1) / 2) - 1;
-}
-
-/**
- * @brief Identify the left child index of the specified index.
- *        NOTE: This function does no bounds checking, so
- *        the left child computed might be out of bounds.
- *
- *   See the above documentation for a picture to describe
- *   the tree.
- *
- *   IndexT is a templated integer type of the index
- *
- * @param[in]  index - the current array index
- * @return     the index of the left child of the current index
- */
-template <typename IndexT>
-inline IndexT __host__ __device__ left_child(IndexT index)
-{
-  static_assert(std::is_integral<IndexT>::value, "Index must be of an integral type");
-
-  return ((index + 1) * 2 - 1);
-}
-
-/**
- * @brief Identify the right child index of the specified index.
- *        NOTE: This function does no bounds checking, so
- *        the right child computed might be out of bounds.
- *
- *   See the above documentation for a picture to describe
- *   the tree.
- *
- *   IndexT is a templated integer type of the index
- *
- * @param[in]  index - the current array index
- * @return     the index of the right child of the current index
- */
-template <typename IndexT>
-inline IndexT __host__ __device__ right_child(IndexT index)
-{
-  static_assert(std::is_integral<IndexT>::value, "Index must be of an integral type");
-
-  return (index + 1) * 2;
-}
-}  // namespace detail
-
-/**
- * @brief Reorder an existing array of elements into a heap
- *
- *   ArrayT is a templated type of the array elements
- *   IndexT is a templated integer type of the index
- *   CompareT is a templated compare function
- *
- * @param[in, out]   array   - the existing array
- * @param[in]        size    - the number of elements in the existing array
- * @param[in]        compare - the comparison function to use
- *
- */
-template <typename ArrayT, typename IndexT, typename CompareT>
-inline void __host__ __device__ heapify(ArrayT *array, IndexT size, CompareT compare)
-{
-  static_assert(std::is_integral<IndexT>::value, "Index must be of an integral type");
-
-  //
-  // We want to order ourselves as a heap.  This is accomplished by starting
-  // at the end and for each element, compare with its parent and
-  // swap if necessary.  We repeat this until there are no more swaps
-  // (should take no more than log2(size) iterations).
-  //
-  IndexT count_swaps = 1;
-  while (count_swaps > 0) {
-    count_swaps = 0;
-    for (IndexT i = size - 1; i > 0; --i) {
-      IndexT p = detail::parent(i);
-
-      if (compare(array[i], array[p])) {
-        thrust::swap(array[i], array[p]);
-        ++count_swaps;
-      }
-    }
-  }
-}
-
-/**
- * @brief Pop the top element off of the heap.  Note that the caller
- *        should decrement the size - the last element in the
- *        array is no longer used.
- *
- *   ArrayT is a templated type of the array elements
- *   IndexT is a templated integer type of the index
- *   CompareT is a templated compare function
- *
- * @return - the top of the heap.
- */
-template <typename ArrayT, typename IndexT, typename CompareT>
-inline ArrayT __host__ __device__ heap_pop(ArrayT *array, IndexT size, CompareT compare)
-{
-  static_assert(std::is_integral<IndexT>::value, "Index must be of an integral type");
-
-  //
-  //  Swap the top of the array with the last element
-  //
-  --size;
-  thrust::swap(array[0], array[size]);
-
-  //
-  //  Now top element is no longer the smallest (largest), so we need
-  //  to sift it down to the proper location.
-  //
-  for (IndexT i = 0; i < size;) {
-    IndexT lc      = detail::left_child(i);
-    IndexT rc      = detail::right_child(i);
-    IndexT smaller = i;
-
-    //
-    //  We can go out of bounds, let's check the simple cases
-    //
-    if (rc < size) {
-      //
-      //  Both children exist in tree, pick the smaller (lerger)
-      //  one.
-      //
-      smaller = (compare(array[lc], array[rc])) ? lc : rc;
-    } else if (lc < size) {
-      smaller = lc;
-    }
-
-    if ((smaller != i) && (compare(array[smaller], array[i]))) {
-      thrust::swap(array[i], array[smaller]);
-      i = smaller;
-    } else {
-      //
-      //  If we don't swap then we can stop checking, break out of the loop
-      //
-      i = size;
-    }
-  }
-
-  return array[size];
-}
-}  // namespace heap
-
-}  // namespace detail
-}  // namespace cugraph
-
-#endif
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ec18640bc11..524b681601f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -297,10 +297,6 @@ ConfigureTest(TRIANGLE_TEST community/triangle_test.cu)
 # - EGO tests --------------------------------------------------------------------------------
 ConfigureTest(EGO_TEST community/egonet_test.cu)
 
-###################################################################################################
-# - RENUMBERING tests -----------------------------------------------------------------------------
-ConfigureTest(RENUMBERING_TEST renumber/renumber_test.cu)
-
 ###################################################################################################
 # - FORCE ATLAS 2  tests --------------------------------------------------------------------------
 ConfigureTest(FA2_TEST layout/force_atlas2_test.cu)
diff --git a/cpp/tests/renumber/renumber_test.cu b/cpp/tests/renumber/renumber_test.cu
deleted file mode 100644
index a7102402acf..00000000000
--- a/cpp/tests/renumber/renumber_test.cu
+++ /dev/null
@@ -1,579 +0,0 @@
-// -*-c++-*-
-
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-//#include "gmock/gmock.h"
-
-#include <utilities/base_fixture.hpp>
-
-#include <converters/renumber.cuh>
-
-#include <rmm/thrust_rmm_allocator.h>
-
-#include <cuda_profiler_api.h>
-#include <curand_kernel.h>
-
-#include <chrono>
-
-struct RenumberingTest : public ::testing::Test {
-};
-
-__global__ void display_list(const char *label, uint32_t *verts, size_t length)
-{
-  printf("%s\n", label);
-
-  for (size_t i = 0; i < length; ++i) { printf("  %u\n", verts[i]); }
-}
-
-__global__ void setup_generator(curandState *state)
-{
-  int id = threadIdx.x + blockIdx.x * blockDim.x;
-  curand_init(43, id, 0, &state[id]);
-}
-
-__global__ void generate_sources(curandState *state, int n, uint32_t *verts)
-{
-  int first  = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-
-  curandState local_state = state[first];
-  for (int id = first; id < n; id += stride) { verts[id] = curand(&local_state); }
-
-  state[first] = local_state;
-}
-
-__global__ void generate_destinations(curandState *state,
-                                      int n,
-                                      const uint32_t *sources,
-                                      uint32_t *destinations)
-{
-  int first  = threadIdx.x + blockIdx.x * blockDim.x;
-  int stride = blockDim.x * gridDim.x;
-
-  curandState local_state = state[first];
-  for (int id = first; id < n; id += stride) {
-    destinations[id] = sources[curand(&local_state) % n];
-  }
-
-  state[first] = local_state;
-}
-
-TEST_F(RenumberingTest, SmallFixedVertexList)
-{
-  uint32_t src_data[] = {4U, 6U, 8U, 20U, 1U};
-  uint32_t dst_data[] = {1U, 29U, 35U, 0U, 77U};
-
-  uint32_t src_expected[] = {2U, 3U, 4U, 5U, 1U};
-  uint32_t dst_expected[] = {1U, 6U, 7U, 0U, 8U};
-
-  size_t length = sizeof(src_data) / sizeof(src_data[0]);
-
-  uint32_t *src_d;
-  uint32_t *dst_d;
-
-  uint32_t tmp_results[length];
-  uint32_t tmp_map[2 * length];
-
-  rmm::device_vector<uint32_t> src(length);
-  rmm::device_vector<uint32_t> dst(length);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-
-  size_t unique_verts = 0;
-
-  auto number_map = cugraph::detail::renumber_vertices(length,
-                                                       src_d,
-                                                       dst_d,
-                                                       src_d,
-                                                       dst_d,
-                                                       &unique_verts,
-                                                       cugraph::detail::HashFunctionObjectInt(511),
-                                                       thrust::less<uint32_t>(),
-                                                       rmm::mr::get_current_device_resource());
-
-  EXPECT_EQ(cudaMemcpy(
-              tmp_map, number_map->data(), sizeof(uint32_t) * unique_verts, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], src_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]);
-  }
-
-  EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], dst_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]);
-  }
-}
-
-TEST_F(RenumberingTest, SmallFixedVertexListNegative)
-{
-  int64_t src_data[] = {4, 6, 8, -20, 1};
-  int64_t dst_data[] = {1, 29, 35, 0, 77};
-
-  int64_t src_expected[] = {2, 3, 4, 8, 1};
-  int64_t dst_expected[] = {1, 5, 6, 0, 7};
-
-  size_t length = sizeof(src_data) / sizeof(src_data[0]);
-
-  int64_t *src_d;
-  int64_t *dst_d;
-
-  int64_t tmp_results[length];
-  int64_t tmp_map[2 * length];
-
-  rmm::device_vector<int64_t> src(length);
-  rmm::device_vector<int64_t> dst(length);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(int64_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(int64_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-
-  size_t unique_verts = 0;
-
-  auto number_map = cugraph::detail::renumber_vertices(length,
-                                                       src_d,
-                                                       dst_d,
-                                                       src_d,
-                                                       dst_d,
-                                                       &unique_verts,
-                                                       cugraph::detail::HashFunctionObjectInt(511),
-                                                       thrust::less<int64_t>(),
-                                                       rmm::mr::get_current_device_resource());
-
-  EXPECT_EQ(
-    cudaMemcpy(tmp_map, number_map->data(), sizeof(int64_t) * unique_verts, cudaMemcpyDeviceToHost),
-    cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(int64_t) * length, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], src_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]);
-  }
-
-  EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(int64_t) * length, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], dst_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]);
-  }
-}
-
-TEST_F(RenumberingTest, SmallFixedVertexList64Bit)
-{
-  uint64_t src_data[] = {4U, 6U, 8U, 20U, 1U};
-  uint64_t dst_data[] = {1U, 29U, 35U, 0U, 77U};
-
-  uint64_t src_expected[] = {2U, 3U, 4U, 5U, 1U};
-  uint64_t dst_expected[] = {1U, 6U, 7U, 0U, 8U};
-
-  size_t length = sizeof(src_data) / sizeof(src_data[0]);
-
-  uint64_t *src_d;
-  uint64_t *dst_d;
-
-  uint64_t tmp_results[length];
-  uint64_t tmp_map[2 * length];
-
-  rmm::device_vector<uint64_t> src(length);
-  rmm::device_vector<uint64_t> dst(length);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-
-  size_t unique_verts = 0;
-
-  auto number_map = cugraph::detail::renumber_vertices(length,
-                                                       src_d,
-                                                       dst_d,
-                                                       src_d,
-                                                       dst_d,
-                                                       &unique_verts,
-                                                       cugraph::detail::HashFunctionObjectInt(511),
-                                                       thrust::less<uint64_t>(),
-                                                       rmm::mr::get_current_device_resource());
-
-  EXPECT_EQ(cudaMemcpy(
-              tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint64_t) * length, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], src_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]);
-  }
-
-  EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint64_t) * length, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], dst_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]);
-  }
-}
-
-TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit)
-{
-  uint64_t src_data[] = {4U, 6U, 8U, 20U, 1U};
-  uint64_t dst_data[] = {1U, 29U, 35U, 0U, 77U};
-
-  uint32_t src_expected[] = {2U, 3U, 4U, 5U, 1U};
-  uint32_t dst_expected[] = {1U, 6U, 7U, 0U, 8U};
-
-  size_t length = sizeof(src_data) / sizeof(src_data[0]);
-
-  uint64_t *src_d;
-  uint64_t *dst_d;
-  uint32_t *src_renumbered_d;
-  uint32_t *dst_renumbered_d;
-
-  uint32_t tmp_results[length];
-  uint64_t tmp_map[2 * length];
-
-  rmm::device_vector<uint64_t> src(length);
-  rmm::device_vector<uint64_t> dst(length);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-  rmm::device_vector<uint32_t> src_renumbered(length);
-  rmm::device_vector<uint32_t> dst_renumbered(length);
-  src_renumbered_d = src_renumbered.data().get();
-  dst_renumbered_d = dst_renumbered.data().get();
-
-  EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice),
-            cudaSuccess);
-
-  size_t unique_verts = 0;
-
-  auto number_map = cugraph::detail::renumber_vertices(length,
-                                                       src_d,
-                                                       dst_d,
-                                                       src_renumbered_d,
-                                                       dst_renumbered_d,
-                                                       &unique_verts,
-                                                       cugraph::detail::HashFunctionObjectInt(511),
-                                                       thrust::less<uint64_t>(),
-                                                       rmm::mr::get_current_device_resource());
-
-  EXPECT_EQ(cudaMemcpy(
-              tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  EXPECT_EQ(
-    cudaMemcpy(tmp_results, src_renumbered_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost),
-    cudaSuccess);
-
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], src_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]);
-  }
-
-  EXPECT_EQ(
-    cudaMemcpy(tmp_results, dst_renumbered_d, sizeof(uint32_t) * length, cudaMemcpyDeviceToHost),
-    cudaSuccess);
-  for (size_t i = 0; i < length; ++i) {
-    EXPECT_EQ(tmp_results[i], dst_expected[i]);
-    EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]);
-  }
-}
-
-TEST_F(RenumberingTest, Random100KVertexSet)
-{
-  const int num_verts = 100000;
-
-  uint64_t *src_d;
-  uint64_t *dst_d;
-
-  std::vector<uint64_t> src_data_vec(num_verts);
-  std::vector<uint64_t> dst_data_vec(num_verts);
-  std::vector<uint64_t> tmp_results_vec(num_verts);
-  std::vector<uint64_t> tmp_map_vec(2 * num_verts);
-
-  uint64_t *src_data    = src_data_vec.data();
-  uint64_t *dst_data    = dst_data_vec.data();
-  uint64_t *tmp_results = tmp_results_vec.data();
-  uint64_t *tmp_map     = tmp_map_vec.data();
-  rmm::device_vector<uint64_t> src(num_verts);
-  rmm::device_vector<uint64_t> dst(num_verts);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  //
-  //  Generate random source and vertex values
-  //
-  srand(43);
-
-  for (int i = 0; i < num_verts; ++i) { src_data[i] = (uint64_t)rand(); }
-
-  for (int i = 0; i < num_verts; ++i) { dst_data[i] = (uint64_t)rand(); }
-
-  EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * num_verts, cudaMemcpyHostToDevice),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * num_verts, cudaMemcpyHostToDevice),
-            cudaSuccess);
-
-  //
-  //  Renumber everything
-  //
-  size_t unique_verts = 0;
-  size_t n_verts{num_verts};
-
-  auto start = std::chrono::system_clock::now();
-
-  auto number_map = cugraph::detail::renumber_vertices(n_verts,
-                                                       src_d,
-                                                       dst_d,
-                                                       src_d,
-                                                       dst_d,
-                                                       &unique_verts,
-                                                       cugraph::detail::HashFunctionObjectInt(511),
-                                                       thrust::less<uint64_t>(),
-                                                       rmm::mr::get_current_device_resource());
-
-  auto end                                      = std::chrono::system_clock::now();
-  std::chrono::duration<double> elapsed_seconds = end - start;
-
-  std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl;
-
-  EXPECT_EQ(cudaMemcpy(
-              tmp_map, number_map->data(), sizeof(uint64_t) * unique_verts, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  EXPECT_EQ(cudaMemcpy(tmp_results, src_d, sizeof(uint64_t) * num_verts, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-
-  size_t min_id = unique_verts;
-  size_t max_id = 0;
-
-  size_t cnt = 0;
-  for (size_t i = 0; i < num_verts; ++i) {
-    min_id = min(min_id, tmp_results[i]);
-    max_id = max(max_id, tmp_results[i]);
-    if (tmp_map[tmp_results[i]] != src_data[i]) ++cnt;
-
-    if (cnt < 20) EXPECT_EQ(tmp_map[tmp_results[i]], src_data[i]);
-  }
-
-  if (cnt > 0) printf("  src error count = %ld out of %d\n", cnt, num_verts);
-
-  EXPECT_EQ(cudaMemcpy(tmp_results, dst_d, sizeof(uint64_t) * num_verts, cudaMemcpyDeviceToHost),
-            cudaSuccess);
-  for (size_t i = 0; i < num_verts; ++i) {
-    min_id = min(min_id, tmp_results[i]);
-    max_id = max(max_id, tmp_results[i]);
-    if (tmp_map[tmp_results[i]] != dst_data[i]) ++cnt;
-
-    if (cnt < 20) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]);
-  }
-
-  if (cnt > 0) printf("  src error count = %ld out of %d\n", cnt, num_verts);
-
-  EXPECT_EQ(min_id, 0);
-  EXPECT_EQ(max_id, (unique_verts - 1));
-}
-
-TEST_F(RenumberingTest, Random10MVertexSet)
-{
-  const int num_verts = 10000000;
-
-  //  A sampling of performance on single Quadro GV100
-  // const int hash_size =  32767;       // 238 ms
-  // const int hash_size =  8191;      // 224 ms
-  const int hash_size = 511;  // 224 ms
-
-  uint32_t *src_d;
-  uint32_t *dst_d;
-
-  rmm::device_vector<uint32_t> src(num_verts);
-  rmm::device_vector<uint32_t> dst(num_verts);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  //
-  //  Init the random number generate
-  //
-  const int num_threads{64};
-  curandState *state;
-
-  rmm::device_vector<curandState> state_vals(num_threads);
-  state = state_vals.data().get();
-  setup_generator<<<num_threads, 1>>>(state);
-  generate_sources<<<num_threads, 1>>>(state, num_verts, src_d);
-  generate_destinations<<<num_threads, 1>>>(state, num_verts, src_d, dst_d);
-
-  std::cout << "done with initialization" << std::endl;
-
-  //
-  //  Renumber everything
-  //
-  size_t unique_verts = 0;
-  size_t n_verts{num_verts};
-
-  auto start = std::chrono::system_clock::now();
-  auto number_map =
-    cugraph::detail::renumber_vertices(n_verts,
-                                       src_d,
-                                       dst_d,
-                                       src_d,
-                                       dst_d,
-                                       &unique_verts,
-                                       cugraph::detail::HashFunctionObjectInt(hash_size),
-                                       thrust::less<uint64_t>(),
-                                       rmm::mr::get_current_device_resource());
-  auto end                                      = std::chrono::system_clock::now();
-  std::chrono::duration<double> elapsed_seconds = end - start;
-
-  std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl;
-  std::cout << "  unique verts = " << unique_verts << std::endl;
-  std::cout << "  hash size = " << hash_size << std::endl;
-}
-
-TEST_F(RenumberingTest, Random100MVertexSet)
-{
-  const int num_verts = 100000000;
-
-  //  A sampling of performance on single Quadro GV100
-  // const int hash_size =  8192;        // 1811 ms
-  // const int hash_size =  16384;       // 1746 ms
-  // const int hash_size =  32768;       // 1662 ms
-  // const int hash_size =  65536;       // 1569 ms
-  // const int hash_size =  16777216;      // 1328 ms
-  const int hash_size = 511;
-
-  uint32_t *src_d;
-  uint32_t *dst_d;
-
-  rmm::device_vector<uint32_t> src(num_verts);
-  rmm::device_vector<uint32_t> dst(num_verts);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  //
-  //  Init the random number generate
-  //
-  const int num_threads{64};
-  curandState *state;
-
-  rmm::device_vector<curandState> state_vals(num_threads);
-  state = state_vals.data().get();
-  setup_generator<<<num_threads, 1>>>(state);
-  generate_sources<<<num_threads, 1>>>(state, num_verts, src_d);
-  generate_destinations<<<num_threads, 1>>>(state, num_verts, src_d, dst_d);
-
-  std::cout << "done with initialization" << std::endl;
-
-  //
-  //  Renumber everything
-  //
-  size_t unique_verts = 0;
-  size_t n_verts{num_verts};
-
-  auto start = std::chrono::system_clock::now();
-  auto number_map =
-    cugraph::detail::renumber_vertices(n_verts,
-                                       src_d,
-                                       dst_d,
-                                       src_d,
-                                       dst_d,
-                                       &unique_verts,
-                                       cugraph::detail::HashFunctionObjectInt(hash_size),
-                                       thrust::less<uint64_t>(),
-                                       rmm::mr::get_current_device_resource());
-  auto end                                      = std::chrono::system_clock::now();
-  std::chrono::duration<double> elapsed_seconds = end - start;
-
-  std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl;
-  std::cout << "  unique verts = " << unique_verts << std::endl;
-  std::cout << "  hash size = " << hash_size << std::endl;
-}
-
-TEST_F(RenumberingTest, Random500MVertexSet)
-{
-  const int num_verts = 500000000;
-
-  //  A sampling of performance on single Quadro GV100
-  // const int hash_size =  8192;      // 9918 ms
-  // const int hash_size =  16384;      // 9550 ms
-  // const int hash_size =  32768;      // 9146 ms
-  // const int hash_size =  131072;      // 8537 ms
-  const int hash_size = 1048576;  // 7335 ms
-  // const int hash_size =  511;      // 7335 ms
-
-  uint32_t *src_d;
-  uint32_t *dst_d;
-
-  rmm::device_vector<uint32_t> src(num_verts);
-  rmm::device_vector<uint32_t> dst(num_verts);
-  src_d = src.data().get();
-  dst_d = dst.data().get();
-
-  //
-  //  Init the random number generate
-  //
-  const int num_threads{64};
-  curandState *state;
-
-  rmm::device_vector<curandState> state_vals(num_threads);
-  state = state_vals.data().get();
-  setup_generator<<<num_threads, 1>>>(state);
-  generate_sources<<<num_threads, 1>>>(state, num_verts, src_d);
-  generate_destinations<<<num_threads, 1>>>(state, num_verts, src_d, dst_d);
-
-  std::cout << "done with initialization" << std::endl;
-
-  //
-  //  Renumber everything
-  //
-  size_t unique_verts = 0;
-  size_t n_verts{num_verts};
-
-  auto start = std::chrono::system_clock::now();
-  auto number_map =
-    cugraph::detail::renumber_vertices(n_verts,
-                                       src_d,
-                                       dst_d,
-                                       src_d,
-                                       dst_d,
-                                       &unique_verts,
-                                       cugraph::detail::HashFunctionObjectInt(hash_size),
-                                       thrust::less<uint64_t>(),
-                                       rmm::mr::get_current_device_resource());
-  auto end                                      = std::chrono::system_clock::now();
-  std::chrono::duration<double> elapsed_seconds = end - start;
-
-  std::cout << "Renumber kernel elapsed time (ms): " << elapsed_seconds.count() * 1000 << std::endl;
-  std::cout << "  unique verts = " << unique_verts << std::endl;
-  std::cout << "  hash size = " << hash_size << std::endl;
-}
-
-CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/python/cugraph/structure/graph_primtypes.pxd b/python/cugraph/structure/graph_primtypes.pxd
index e0db6c31fca..533e2d84c66 100644
--- a/python/cugraph/structure/graph_primtypes.pxd
+++ b/python/cugraph/structure/graph_primtypes.pxd
@@ -128,16 +128,6 @@ cdef extern from "cugraph/algorithms.hpp" namespace "cugraph":
     cdef unique_ptr[GraphCOO[VT, ET, WT]] get_two_hop_neighbors[VT,ET,WT](
         const GraphCSRView[VT, ET, WT] &graph) except +
 
-cdef extern from "cugraph/functions.hpp" namespace "cugraph":
-
-    cdef unique_ptr[device_buffer] renumber_vertices[VT_IN,VT_OUT,ET](
-        ET number_of_edges,
-        const VT_IN *src,
-        const VT_IN *dst,
-        VT_OUT *src_renumbered,
-        VT_OUT *dst_renumbered,
-        ET *map_size) except +
-
 
 cdef extern from "<utility>" namespace "std" nogil:
     cdef unique_ptr[GraphCOO[int,int,float]] move(unique_ptr[GraphCOO[int,int,float]])
diff --git a/python/cugraph/structure/graph_primtypes_wrapper.pyx b/python/cugraph/structure/graph_primtypes_wrapper.pyx
index 91af28380c3..95de1d70732 100644
--- a/python/cugraph/structure/graph_primtypes_wrapper.pyx
+++ b/python/cugraph/structure/graph_primtypes_wrapper.pyx
@@ -18,7 +18,6 @@
 
 from cugraph.structure.graph_primtypes cimport *
 from cugraph.structure.graph_primtypes cimport get_two_hop_neighbors as c_get_two_hop_neighbors
-from cugraph.structure.graph_primtypes cimport renumber_vertices as c_renumber_vertices
 from cugraph.structure.utils_wrapper import *
 from libcpp cimport bool
 import enum
@@ -52,45 +51,6 @@ class Direction(enum.Enum):
     OUT = 2
 
 
-def renumber(source_col, dest_col):
-    num_edges = len(source_col)
-
-    src_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32)
-    dst_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32)
-
-    cdef uintptr_t c_src = source_col.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_dst = dest_col.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_src_renumbered = src_renumbered.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_dst_renumbered = dst_renumbered.__cuda_array_interface__['data'][0]
-    cdef int map_size = 0
-    cdef int n_edges = num_edges
-
-    cdef unique_ptr[device_buffer] numbering_map
-
-    if (source_col.dtype == np.int32):
-        numbering_map = move(c_renumber_vertices[int,int,int](n_edges,
-                                                              <int*>c_src,
-                                                              <int*>c_dst,
-                                                              <int*>c_src_renumbered,
-                                                              <int*>c_dst_renumbered,
-                                                              &map_size))
-    else:
-        numbering_map = move(c_renumber_vertices[long,int,int](n_edges,
-                                                               <long*>c_src,
-                                                               <long*>c_dst,
-                                                               <int*>c_src_renumbered,
-                                                               <int*>c_dst_renumbered,
-                                                               &map_size))
-
-
-    map = DeviceBuffer.c_from_unique_ptr(move(numbering_map))
-    map = Buffer(map)
-
-    output_map = cudf.Series(data=map, dtype=source_col.dtype)
-
-    return src_renumbered, dst_renumbered, output_map
-
-
 def view_adj_list(input_graph):
 
     if input_graph.adjlist is None: