rapidsai · rapids-bot · Jul 27, 2022 · May 31, 2022 · May 31, 2022 · May 31, 2022
@@ -630,6 +630,25 @@ DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu)
 #endif
 }
 
+/**
+ * @brief Shuffle the data inside a warp from lower lane IDs
+ * @tparam T the data type (currently assumed to be 4B)
+ * @param val value to be shuffled
+ * @param delta lower lane ID delta from where to shuffle
+ * @param width lane width
+ * @param mask mask of participating threads (Volta+)
+ * @return the shuffled data
+ */
+template <typename T>
+DI T shfl_up(T val, int delta, int width = WarpSize, uint32_t mask = 0xffffffffu)
+{
+#if CUDART_VERSION >= 9000
+  return __shfl_up_sync(mask, val, delta, width);
+#else
+  return __shfl_up(val, delta, width);
+#endif
+}
+
 /**
  * @brief Shuffle the data inside a warp
  * @tparam T the data type (currently assumed to be 4B)

@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng_device.cuh>
+#include <raft/random/rng_state.hpp>
+
+namespace raft {
+namespace random {
+namespace detail {
+
+template <typename IdxT, typename ProbT>
+DI void gen_and_update_bits(IdxT& src_id,
+                            IdxT& dst_id,
+                            ProbT a,
+                            ProbT ab,
+                            ProbT abc,
+                            IdxT r_scale,
+                            IdxT c_scale,
+                            IdxT curr_depth,
+                            raft::random::PCGenerator& gen)
+{
+  bool src_bit, dst_bit;
+  ProbT val;
+  gen.next(val);
+  if (val <= a) {
+    src_bit = dst_bit = false;
+  } else if (val <= ab) {
+    src_bit = false;
+    dst_bit = true;
+  } else if (val <= abc) {
+    src_bit = true;
+    dst_bit = false;
+  } else {
+    src_bit = dst_bit = true;
+  }
+  if (curr_depth < r_scale) { src_id |= (IdxT(src_bit) << (r_scale - curr_depth - 1)); }
+  if (curr_depth < c_scale) { dst_id |= (IdxT(dst_bit) << (c_scale - curr_depth - 1)); }
+}
+
+template <typename IdxT>
+DI void store_ids(
+  IdxT* out, IdxT* out_src, IdxT* out_dst, IdxT src_id, IdxT dst_id, IdxT idx, IdxT n_edges)
+{
+  if (idx < n_edges) {
+    if (out != nullptr) {
+      // uncoalesced gmem accesses!
+      out[idx * 2]     = src_id;
+      out[idx * 2 + 1] = dst_id;
+    }
+    if (out_src != nullptr) { out_src[idx] = src_id; }
+    if (out_dst != nullptr) { out_dst[idx] = dst_id; }
+  }
+}
+
+template <typename IdxT, typename ProbT>
+__global__ void rmat_gen_kernel(IdxT* out,
+                                IdxT* out_src,
+                                IdxT* out_dst,
+                                const ProbT* theta,
+                                IdxT r_scale,
+                                IdxT c_scale,
+                                IdxT n_edges,
+                                IdxT max_scale,
+                                raft::random::RngState r)
+{
+  IdxT idx = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
+  extern __shared__ ProbT s_theta[];
+  auto theta_len = max_scale * 2 * 2;
+  // load the probabilities into shared memory and then convert them into cdf's
+  // currently there are smem bank conflicts due to the way these are accessed
+  for (int i = threadIdx.x; i < theta_len; i += blockDim.x) {
+    s_theta[i] = theta[i];
+  }
+  __syncthreads();
+  for (int i = threadIdx.x; i < max_scale; i += blockDim.x) {
+    auto a             = s_theta[4 * i];
+    auto b             = s_theta[4 * i + 1];
+    auto c             = s_theta[4 * i + 2];
+    s_theta[4 * i + 1] = a + b;
+    s_theta[4 * i + 2] = a + b + c;
+    s_theta[4 * i + 3] += a + b + c;
+  }
+  __syncthreads();
+  IdxT src_id{0}, dst_id{0};
+  raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
+  for (IdxT i = 0; i < max_scale; ++i) {
+    auto a = s_theta[i * 4], ab = s_theta[i * 4 + 1], abc = s_theta[i * 4 + 2];
+    gen_and_update_bits(src_id, dst_id, a, ab, abc, r_scale, c_scale, i, gen);
+  }
+  store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
+}
+
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen_caller(IdxT* out,
+                                 IdxT* out_src,
+                                 IdxT* out_dst,
+                                 const ProbT* theta,
+                                 IdxT r_scale,
+                                 IdxT c_scale,
+                                 IdxT n_edges,
+                                 cudaStream_t stream,
+                                 raft::random::RngState& r)
+{
+  if (n_edges <= 0) return;
+  static constexpr int N_THREADS = 512;
+  auto max_scale                 = max(r_scale, c_scale);
+  size_t smem_size               = sizeof(ProbT) * max_scale * 2 * 2;
+  auto n_blks                    = raft::ceildiv<IdxT>(n_edges, N_THREADS);
+  rmat_gen_kernel<<<n_blks, N_THREADS, smem_size, stream>>>(
+    out, out_src, out_dst, theta, r_scale, c_scale, n_edges, max_scale, r);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  r.advance(n_edges, max_scale);
+}
+
+template <typename IdxT, typename ProbT>
+__global__ void rmat_gen_kernel(IdxT* out,
+                                IdxT* out_src,
+                                IdxT* out_dst,
+                                ProbT a,
+                                ProbT b,
+                                ProbT c,
+                                IdxT r_scale,
+                                IdxT c_scale,
+                                IdxT n_edges,
+                                IdxT max_scale,
+                                raft::random::RngState r)
+{
+  IdxT idx = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
+  IdxT src_id{0}, dst_id{0};
+  raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
+  auto min_scale = min(r_scale, c_scale);
+  IdxT i         = 0;
+  for (; i < min_scale; ++i) {
+    gen_and_update_bits(src_id, dst_id, a, a + b, a + b + c, r_scale, c_scale, i, gen);
+  }
+  for (; i < r_scale; ++i) {
+    gen_and_update_bits(src_id, dst_id, a + b, a + b, ProbT(1), r_scale, c_scale, i, gen);
+  }
+  for (; i < c_scale; ++i) {
+    gen_and_update_bits(src_id, dst_id, a + c, ProbT(1), ProbT(1), r_scale, c_scale, i, gen);
+  }
+  store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
+}
+
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen_caller(IdxT* out,
+                                 IdxT* out_src,
+                                 IdxT* out_dst,
+                                 ProbT a,
+                                 ProbT b,
+                                 ProbT c,
+                                 IdxT r_scale,
+                                 IdxT c_scale,
+                                 IdxT n_edges,
+                                 cudaStream_t stream,
+                                 raft::random::RngState& r)
+{
+  if (n_edges <= 0) return;
+  static constexpr int N_THREADS = 512;
+  auto max_scale                 = max(r_scale, c_scale);
+  auto n_blks                    = raft::ceildiv<IdxT>(n_edges, N_THREADS);
+  rmat_gen_kernel<<<n_blks, N_THREADS, 0, stream>>>(
+    out, out_src, out_dst, a, b, c, r_scale, c_scale, n_edges, max_scale, r);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  r.advance(n_edges, max_scale);
+}
+
+}  // end namespace detail
+}  // end namespace random
+}  // end namespace raft
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/rmat_rectangular_generator.cuh"
+
+namespace raft::random {
+
+/**
+ * @brief Generate RMAT for a rectangular shaped adjacency matrices (useful when
+ *        graphs to be generated are bipartite)
+ *
+ * @tparam IdxT  node indices type
+ * @tparam ProbT data type used for probability distributions (either fp32 or fp64)
+ *
+ * @param[out] out     generated edgelist [on device] [dim = n_edges x 2]. On each row
+ *                     the first element corresponds to the source node id while the
+ *                     second, the destination node id. If you don't need this output
+ *                     then pass a `nullptr` in its place.
+ * @param[out] out_src list of source node id's [on device] [len = n_edges]. If you
+ *                     don't need this output then pass a `nullptr` in its place.
+ * @param[out] out_dst list of destination node id's [on device] [len = n_edges]. If
+ *                     you don't need this output then pass a `nullptr` in its place.
+ * @param[in]  theta   distribution of each quadrant at each level of resolution.
+ *                     Since these are probabilities, each of the 2x2 matrix for
+ *                     each level of the RMAT must sum to one. [on device]
+ *                     [dim = max(r_scale, c_scale) x 2 x 2]. Of course, it is assumed
+ *                     that each of the group of 2 x 2 numbers all sum up to 1.
+ * @param[in]  r_scale 2^r_scale represents the number of source nodes
+ * @param[in]  c_scale 2^c_scale represents the number of destination nodes
+ * @param[in]  n_edges number of edges to generate
+ * @param[in]  stream  cuda stream to schedule the work on
+ * @param[in]  r       underlying state of the random generator. Especially useful when
+ *                     one wants to call this API for multiple times in order to generate
+ *                     a larger graph. For that case, just create this object with the
+ *                     initial seed once and after every call continue to pass the same
+ *                     object for the successive calls.
+ *
+ * When `r_scale != c_scale` it is referred to as rectangular adjacency matrix case (IOW generating
+ * bipartite graphs). In this case, at `depth >= r_scale`, the distribution is assumed to be:
+ * `[theta[4 * depth] + theta[4 * depth + 2], theta[4 * depth + 1] + theta[4 * depth + 3]; 0, 0]`.
+ * Then for the `depth >= c_scale`, the distribution is assumed to be:
+ * `[theta[4 * depth] + theta[4 * depth + 1], 0; theta[4 * depth + 2] + theta[4 * depth + 3], 0]`.
+ *
+ * @note This can generate duplicate edges and self-loops. It is the responsibility of the
+ *       caller to clean them up accordingly.
+
+ * @note This also only generates directed graphs. If undirected graphs are needed, then a
+ *       separate post-processing step is expected to be done by the caller.
+ *
+ * @{
+ */
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen(IdxT* out,
+                          IdxT* out_src,
+                          IdxT* out_dst,
+                          const ProbT* theta,
+                          IdxT r_scale,
+                          IdxT c_scale,
+                          IdxT n_edges,
+                          cudaStream_t stream,
+                          raft::random::RngState& r)
+{
+  detail::rmat_rectangular_gen_caller(
+    out, out_src, out_dst, theta, r_scale, c_scale, n_edges, stream, r);
+}
+
+/**
+ * This is the same as the previous method but assumes the same a, b, c, d probability
+ * distributions across all the scales
+ */
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen(IdxT* out,
+                          IdxT* out_src,
+                          IdxT* out_dst,
+                          ProbT a,
+                          ProbT b,
+                          ProbT c,
+                          IdxT r_scale,
+                          IdxT c_scale,
+                          IdxT n_edges,
+                          cudaStream_t stream,
+                          raft::random::RngState& r)
+{
+  detail::rmat_rectangular_gen_caller(
+    out, out_src, out_dst, a, b, c, r_scale, c_scale, n_edges, stream, r);
+}
+/** @} */
+
+}  // end namespace raft::random
@@ -83,6 +83,7 @@ add_executable(test_raft
     test/random/permute.cu
     test/random/rng.cu
     test/random/rng_int.cu
+    test/random/rmat_rectangular_generator.cu
     test/random/sample_without_replacement.cu
     test/span.cpp
     test/span.cu