diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 362dba66c5..62f5cdcccf 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -630,6 +630,25 @@ DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu)
 #endif
 }
 
+/**
+ * @brief Shuffle the data inside a warp from lower lane IDs
+ * @tparam T the data type (currently assumed to be 4B)
+ * @param val value to be shuffled
+ * @param delta lower lane ID delta from where to shuffle
+ * @param width lane width
+ * @param mask mask of participating threads (Volta+)
+ * @return the shuffled data
+ */
+template <typename T>
+DI T shfl_up(T val, int delta, int width = WarpSize, uint32_t mask = 0xffffffffu)
+{
+#if CUDART_VERSION >= 9000
+  return __shfl_up_sync(mask, val, delta, width);
+#else
+  return __shfl_up(val, delta, width);
+#endif
+}
+
 /**
  * @brief Shuffle the data inside a warp
  * @tparam T the data type (currently assumed to be 4B)
diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
new file mode 100644
index 0000000000..8a1f23e785
--- /dev/null
+++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng_device.cuh>
+#include <raft/random/rng_state.hpp>
+
+namespace raft {
+namespace random {
+namespace detail {
+
+template <typename IdxT, typename ProbT>
+DI void gen_and_update_bits(IdxT& src_id,
+                            IdxT& dst_id,
+                            ProbT a,
+                            ProbT ab,
+                            ProbT abc,
+                            IdxT r_scale,
+                            IdxT c_scale,
+                            IdxT curr_depth,
+                            raft::random::PCGenerator& gen)
+{
+  bool src_bit, dst_bit;
+  ProbT val;
+  gen.next(val);
+  if (val <= a) {
+    src_bit = dst_bit = false;
+  } else if (val <= ab) {
+    src_bit = false;
+    dst_bit = true;
+  } else if (val <= abc) {
+    src_bit = true;
+    dst_bit = false;
+  } else {
+    src_bit = dst_bit = true;
+  }
+  if (curr_depth < r_scale) { src_id |= (IdxT(src_bit) << (r_scale - curr_depth - 1)); }
+  if (curr_depth < c_scale) { dst_id |= (IdxT(dst_bit) << (c_scale - curr_depth - 1)); }
+}
+
+template <typename IdxT>
+DI void store_ids(
+  IdxT* out, IdxT* out_src, IdxT* out_dst, IdxT src_id, IdxT dst_id, IdxT idx, IdxT n_edges)
+{
+  if (idx < n_edges) {
+    if (out != nullptr) {
+      // uncoalesced gmem accesses!
+      out[idx * 2] = src_id;
+      out[idx * 2 + 1] = dst_id;
+    }
+    if (out_src != nullptr) { out_src[idx] = src_id; }
+    if (out_dst != nullptr) { out_dst[idx] = dst_id; }
+  }
+}
+
+template <typename IdxT, typename ProbT>
+__global__ void rmat_gen_kernel(IdxT* out,
+                                IdxT* out_src,
+                                IdxT* out_dst,
+                                const ProbT* theta,
+                                IdxT r_scale,
+                                IdxT c_scale,
+                                IdxT n_edges,
+                                IdxT max_scale,
+                                raft::random::RngState r)
+{
+  IdxT idx = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
+  extern __shared__ ProbT s_theta[];
+  auto theta_len = max_scale * 2 * 2;
+  // load the probabilities into shared memory and then convert them into cdf's
+  // currently there are smem bank conflicts due to the way these are accessed
+  for (int i = threadIdx.x; i < theta_len; i += blockDim.x) {
+    s_theta[i] = theta[i];
+  }
+  __syncthreads();
+  for (int i = threadIdx.x; i < max_scale; i += blockDim.x) {
+    auto a = s_theta[4 * i];
+    auto b = s_theta[4 * i + 1];
+    auto c = s_theta[4 * i + 2];
+    s_theta[4 * i + 1] = a + b;
+    s_theta[4 * i + 2] = a + b + c;
+    s_theta[4 * i + 3] += a + b + c;
+  }
+  __syncthreads();
+  IdxT src_id{0}, dst_id{0};
+  raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
+  for (IdxT i = 0; i < max_scale; ++i) {
+    auto a = s_theta[i * 4], ab = s_theta[i * 4 + 1], abc = s_theta[i * 4 + 2];
+    gen_and_update_bits(src_id, dst_id, a, ab, abc, r_scale, c_scale, i, gen);
+  }
+  store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
+}
+
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen_caller(IdxT* out,
+                                 IdxT* out_src,
+                                 IdxT* out_dst,
+                                 const ProbT* theta,
+                                 IdxT r_scale,
+                                 IdxT c_scale,
+                                 IdxT n_edges,
+                                 cudaStream_t stream,
+                                 raft::random::RngState& r)
+{
+  if (n_edges <= 0) return;
+  static constexpr int N_THREADS = 512;
+  auto max_scale = max(r_scale, c_scale);
+  size_t smem_size = sizeof(ProbT) * max_scale * 2 * 2;
+  auto n_blks = raft::ceildiv<IdxT>(n_edges, N_THREADS);
+  rmat_gen_kernel<<<n_blks, N_THREADS, smem_size, stream>>>(
+    out, out_src, out_dst, theta, r_scale, c_scale, n_edges, max_scale, r);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  r.advance(n_edges, max_scale);
+}
+
+template <typename IdxT, typename ProbT>
+__global__ void rmat_gen_kernel(IdxT* out,
+                                IdxT* out_src,
+                                IdxT* out_dst,
+                                ProbT a,
+                                ProbT b,
+                                ProbT c,
+                                IdxT r_scale,
+                                IdxT c_scale,
+                                IdxT n_edges,
+                                IdxT max_scale,
+                                raft::random::RngState r)
+{
+  IdxT idx = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
+  IdxT src_id{0}, dst_id{0};
+  raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
+  auto min_scale = min(r_scale, c_scale);
+  IdxT i = 0;
+  for (; i < min_scale; ++i) {
+    gen_and_update_bits(src_id, dst_id, a, a + b, a + b + c, r_scale, c_scale, i, gen);
+  }
+  for (; i < r_scale; ++i) {
+    gen_and_update_bits(src_id, dst_id, a + b, a + b, ProbT(1), r_scale, c_scale, i, gen);
+  }
+  for (; i < c_scale; ++i) {
+    gen_and_update_bits(src_id, dst_id, a + c, ProbT(1), ProbT(1), r_scale, c_scale, i, gen);
+  }
+  store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
+}
+
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen_caller(IdxT* out,
+                                 IdxT* out_src,
+                                 IdxT* out_dst,
+                                 ProbT a,
+                                 ProbT b,
+                                 ProbT c,
+                                 IdxT r_scale,
+                                 IdxT c_scale,
+                                 IdxT n_edges,
+                                 cudaStream_t stream,
+                                 raft::random::RngState& r)
+{
+  if (n_edges <= 0) return;
+  static constexpr int N_THREADS = 512;
+  auto max_scale = max(r_scale, c_scale);
+  auto n_blks = raft::ceildiv<IdxT>(n_edges, N_THREADS);
+  rmat_gen_kernel<<<n_blks, N_THREADS, 0, stream>>>(
+    out, out_src, out_dst, a, b, c, r_scale, c_scale, n_edges, max_scale, r);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  r.advance(n_edges, max_scale);
+}
+
+}  // end namespace detail
+}  // end namespace random
+}  // end namespace raft
diff --git a/cpp/include/raft/random/rmat_rectangular_generator.cuh b/cpp/include/raft/random/rmat_rectangular_generator.cuh
new file mode 100644
index 0000000000..aad1cf0c88
--- /dev/null
+++ b/cpp/include/raft/random/rmat_rectangular_generator.cuh
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/rmat_rectangular_generator.cuh"
+
+namespace raft::random {
+
+/**
+ * @brief Generate an RMAT graph for a rectangular adjacency matrix (useful when the
+ *        graphs to be generated are bipartite)
+ *
+ * @tparam IdxT  node indices type
+ * @tparam ProbT data type used for probability distributions (either fp32 or fp64)
+ *
+ * @param[out] out     generated edgelist [on device] [dim = n_edges x 2]. On each row
+ *                     the first element is the source node id and the second is the
+ *                     destination node id. If you don't need this output then pass a
+ *                     `nullptr` in its place.
+ * @param[out] out_src list of source node id's [on device] [len = n_edges]. If you
+ *                     don't need this output then pass a `nullptr` in its place.
+ * @param[out] out_dst list of destination node id's [on device] [len = n_edges]. If
+ *                     you don't need this output then pass a `nullptr` in its place.
+ * @param[in]  theta   distribution of each quadrant at each level of resolution.
+ *                     Since these are probabilities, each 2x2 matrix (one per level
+ *                     of the RMAT) must sum to one. [on device]
+ *                     [dim = max(r_scale, c_scale) x 2 x 2]
+ * @param[in]  r_scale 2^r_scale represents the number of source nodes
+ * @param[in]  c_scale 2^c_scale represents the number of destination nodes
+ * @param[in]  n_edges number of edges to generate
+ * @param[in]  stream  cuda stream on which to schedule the work
+ * @param[in]  r       underlying state of the random generator. Especially useful when
+ *                     one wants to call this API multiple times in order to generate a
+ *                     larger graph. In that case, create this object with the initial
+ *                     seed once and keep passing the same object to the successive calls.
+ *
+ * When `r_scale != c_scale`, this is referred to as the rectangular adjacency matrix case
+ * (in other words, generating bipartite graphs). In this case, at `depth >= r_scale` the
+ * distribution is assumed to be:
+ * `[theta[4 * depth] + theta[4 * depth + 2], theta[4 * depth + 1] + theta[4 * depth + 3]; 0, 0]`.
+ * Then, for `depth >= c_scale`, the distribution is assumed to be:
+ * `[theta[4 * depth] + theta[4 * depth + 1], 0; theta[4 * depth + 2] + theta[4 * depth + 3], 0]`.
+ *
+ * @note This can generate duplicate edges and self-loops. It is the responsibility of the
+ *       caller to clean them up accordingly.
+ *
+ * @note This also only generates directed graphs. If undirected graphs are needed, then a
+ *       separate post-processing step is expected to be done by the caller.
+ *
+ * @{
+ */
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen(IdxT* out,
+                          IdxT* out_src,
+                          IdxT* out_dst,
+                          const ProbT* theta,
+                          IdxT r_scale,
+                          IdxT c_scale,
+                          IdxT n_edges,
+                          cudaStream_t stream,
+                          raft::random::RngState& r)
+{
+  detail::rmat_rectangular_gen_caller(
+    out, out_src, out_dst, theta, r_scale, c_scale, n_edges, stream, r);
+}
+
+/**
+ * This is the same as the previous method, but assumes the same a, b, c, d probability
+ * distribution across all the scales.
+ */
+template <typename IdxT, typename ProbT>
+void rmat_rectangular_gen(IdxT* out,
+                          IdxT* out_src,
+                          IdxT* out_dst,
+                          ProbT a,
+                          ProbT b,
+                          ProbT c,
+                          IdxT r_scale,
+                          IdxT c_scale,
+                          IdxT n_edges,
+                          cudaStream_t stream,
+                          raft::random::RngState& r)
+{
+  detail::rmat_rectangular_gen_caller(
+    out, out_src, out_dst, a, b, c, r_scale, c_scale, n_edges, stream, r);
+}
+/** @} */
+
+}  // end namespace raft::random
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 43c6257966..10ed08c0b5 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -83,6 +83,7 @@ add_executable(test_raft
   test/random/permute.cu
   test/random/rng.cu
   test/random/rng_int.cu
+  test/random/rmat_rectangular_generator.cu
   test/random/sample_without_replacement.cu
   test/span.cpp
   test/span.cu
diff --git a/cpp/test/random/rmat_rectangular_generator.cu b/cpp/test/random/rmat_rectangular_generator.cu
new file mode 100644
index 0000000000..4ccda9c1fa
--- /dev/null
+++ b/cpp/test/random/rmat_rectangular_generator.cu
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+
+#include "../test_utils.h"
+
+#include <raft/random/rmat_rectangular_generator.cuh>
+#include <raft/random/rng.cuh>
+#include <rmm/device_uvector.hpp>
+#include <vector>
+
+namespace raft {
+namespace random {
+
+// Courtesy: cuGraph unit-tests
+
+struct RmatInputs {
+  size_t r_scale;
+  size_t c_scale;
+  size_t n_edges;
+  bool theta_array;
+  uint64_t seed;
+  float eps;
+};
+
+template <typename OutT, typename InT>
+__global__ void normalize_kernel(
+  OutT* theta, const InT* in_vals, size_t max_scale, size_t r_scale, size_t c_scale)
+{
+  size_t idx = threadIdx.x;
+  if (idx < max_scale) {
+    auto a = OutT(in_vals[4 * idx]);
+    auto b = OutT(in_vals[4 * idx + 1]);
+    auto c = OutT(in_vals[4 * idx + 2]);
+    auto d = OutT(in_vals[4 * idx + 3]);
+    auto sum = a + b + c + d;
+    a /= sum;
+    b /= sum;
+    c /= sum;
+    d /= sum;
+    theta[4 * idx] = a;
+    theta[4 * idx + 1] = b;
+    theta[4 * idx + 2] = c;
+    theta[4 * idx + 3] = d;
+  }
+}
+
+// handle rectangular cases correctly
+template <typename OutT>
+__global__ void handle_rect_kernel(OutT* theta, size_t max_scale, size_t r_scale, size_t c_scale)
+{
+  size_t idx = threadIdx.x;
+  if (idx < max_scale) {
+    auto a = theta[4 * idx];
+    auto b = theta[4 * idx + 1];
+    auto c = theta[4 * idx + 2];
+    auto d = theta[4 * idx + 3];
+    if (idx >= r_scale) {
+      a += c;
+      c = OutT(0);
+      b += d;
+      d = OutT(0);
+    }
+    if (idx >= c_scale) {
+      a += b;
+      b = OutT(0);
+      c += d;
+      d = OutT(0);
+    }
+    theta[4 * idx] = a;
+    theta[4 * idx + 1] = b;
+    theta[4 * idx + 2] = c;
+    theta[4 * idx + 3] = d;
+  }
+}
+
+// for a single probability distribution across depths, just replicate the theta's!
+// this will keep the test code simpler
+template <typename OutT>
+__global__ void theta_kernel(OutT* theta, size_t max_scale, size_t r_scale, size_t c_scale)
+{
+  size_t idx = threadIdx.x;
+  if (idx != 0 && idx < max_scale) {
+    auto a = theta[0];
+    auto b = theta[1];
+    auto c = theta[2];
+    auto d = theta[3];
+    if (idx >= r_scale) {
+      a += c;
+      c = OutT(0);
+      b += d;
+      d = OutT(0);
+    }
+    if (idx >= c_scale) {
+      a += b;
+      b = OutT(0);
+      c += d;
+      d = OutT(0);
+    }
+    theta[4 * idx] = a;
+    theta[4 * idx + 1] = b;
+    theta[4 * idx + 2] = c;
+    theta[4 * idx + 3] = d;
+  }
+}
+
+template <typename OutT, typename InT>
+void normalize(OutT* theta,
+               const InT* in_vals,
+               size_t max_scale,
+               size_t r_scale,
+               size_t c_scale,
+               bool handle_rect,
+               bool theta_array,
+               cudaStream_t stream)
+{
+  // one threadblock with 256 threads is more than enough as the 'scale' parameters
+  // won't be that large!
+  normalize_kernel<<<1, 256, 0, stream>>>(theta, in_vals, max_scale, r_scale, c_scale);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  if (handle_rect) {
+    handle_rect_kernel<<<1, 256, 0, stream>>>(theta, max_scale, r_scale, c_scale);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+  if (!theta_array) {
+    theta_kernel<<<1, 256, 0, stream>>>(theta, max_scale, r_scale, c_scale);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
+__global__ void compute_hist(
+  int* hist, const size_t* out, size_t len, size_t max_scale, size_t r_scale, size_t c_scale)
+{
+  size_t idx = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
+  if (idx + 1 < len) {
+    auto src = out[idx], dst = out[idx + 1];
+    for (size_t j = 0; j < max_scale; ++j) {
+      bool src_bit = j < r_scale ? src & (1 << (r_scale - j - 1)) : 0;
+      bool dst_bit = j < c_scale ? dst & (1 << (c_scale - j - 1)) : 0;
+      auto idx = j * 4 + src_bit * 2 + dst_bit;
+      atomicAdd(hist + idx, 1);
+    }
+  }
+}
+
+class RmatGenTest : public ::testing::TestWithParam<RmatInputs> {
+ public:
+  RmatGenTest()
+    : handle{},
+      stream{handle.get_stream()},
+      params{::testing::TestWithParam<RmatInputs>::GetParam()},
+      out{params.n_edges * 2, stream},
+      out_src{params.n_edges, stream},
+      out_dst{params.n_edges, stream},
+      theta{0, stream},
+      h_theta{},
+      state{params.seed, GeneratorType::GenPC},
+      max_scale{std::max(params.r_scale, params.c_scale)}
+  {
+    theta.resize(4 * max_scale, stream);
+    uniform(state, theta.data(), theta.size(), 0.0f, 1.0f, stream);
+    normalize(theta.data(),
+              theta.data(),
+              max_scale,
+              params.r_scale,
+              params.c_scale,
+              params.r_scale != params.c_scale,
+              params.theta_array,
+              stream);
+    h_theta.resize(theta.size());
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+    raft::update_host(h_theta.data(), theta.data(), theta.size(), stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+ protected:
+  void SetUp() override
+  {
+    if (params.theta_array) {
+      rmat_rectangular_gen(out.data(),
+                           out_src.data(),
+                           out_dst.data(),
+                           theta.data(),
+                           params.r_scale,
+                           params.c_scale,
+                           params.n_edges,
+                           stream,
+                           state);
+    } else {
+      rmat_rectangular_gen(out.data(),
+                           out_src.data(),
+                           out_dst.data(),
+                           h_theta[0],
+                           h_theta[1],
+                           h_theta[2],
+                           params.r_scale,
+                           params.c_scale,
+                           params.n_edges,
+                           stream,
+                           state);
+    }
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  void validate()
+  {
+    rmm::device_uvector<int> hist{theta.size(), stream};
+    RAFT_CUDA_TRY(cudaMemsetAsync(hist.data(), 0, hist.size() * sizeof(int), stream));
+    compute_hist<<<raft::ceildiv<size_t>(out.size() / 2, 256), 256, 0, stream>>>(
+      hist.data(), out.data(), out.size(), max_scale, params.r_scale, params.c_scale);
+    RAFT_CUDA_TRY(cudaGetLastError());
+    rmm::device_uvector<float> computed_theta{theta.size(), stream};
+    normalize(computed_theta.data(),
+              hist.data(),
+              max_scale,
+              params.r_scale,
+              params.c_scale,
+              false,
+              true,
+              stream);
+    RAFT_CUDA_TRY(cudaGetLastError());
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+    ASSERT_TRUE(devArrMatchHost(
+      h_theta.data(), computed_theta.data(), theta.size(), CompareApprox<float>(params.eps)));
+  }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  RmatInputs params;
+  rmm::device_uvector<size_t> out, out_src, out_dst;
+  rmm::device_uvector<float> theta;
+  std::vector<float> h_theta;
+  RngState state;
+  size_t max_scale;
+};
+
+static const float TOLERANCE = 0.01f;
+
+const std::vector<RmatInputs> inputs = {
+  // square adjacency
+  {16, 16, 100000, false, 123456ULL, TOLERANCE},
+  {16, 16, 100000, true, 123456ULL, TOLERANCE},
+  {16, 16, 200000, false, 123456ULL, TOLERANCE},
+  {16, 16, 200000, true, 123456ULL, TOLERANCE},
+  {18, 18, 100000, false, 123456ULL, TOLERANCE},
+  {18, 18, 100000, true, 123456ULL, TOLERANCE},
+  {18, 18, 200000, false, 123456ULL, TOLERANCE},
+  {18, 18, 200000, true, 123456ULL, TOLERANCE},
+  {16, 16, 100000, false, 456789ULL, TOLERANCE},
+  {16, 16, 100000, true, 456789ULL, TOLERANCE},
+  {16, 16, 200000, false, 456789ULL, TOLERANCE},
+  {16, 16, 200000, true, 456789ULL, TOLERANCE},
+  {18, 18, 100000, false, 456789ULL, TOLERANCE},
+  {18, 18, 100000, true, 456789ULL, TOLERANCE},
+  {18, 18, 200000, false, 456789ULL, TOLERANCE},
+  {18, 18, 200000, true, 456789ULL, TOLERANCE},
+
+  // rectangular adjacency
+  {16, 18, 200000, false, 123456ULL, TOLERANCE},
+  {16, 18, 200000, true, 123456ULL, TOLERANCE},
+  {18, 16, 200000, false, 123456ULL, TOLERANCE},
+  {18, 16, 200000, true, 123456ULL, TOLERANCE},
+  {16, 18, 200000, false, 456789ULL, TOLERANCE},
+  {16, 18, 200000, true, 456789ULL, TOLERANCE},
+  {18, 16, 200000, false, 456789ULL, TOLERANCE},
+  {18, 16, 200000, true, 456789ULL, TOLERANCE}};
+
+TEST_P(RmatGenTest, Result) { validate(); }
+INSTANTIATE_TEST_SUITE_P(RmatGenTests, RmatGenTest, ::testing::ValuesIn(inputs));
+
+}  // namespace random
+}  // namespace raft
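
Usage sketch (illustrative only, not part of the diff): the snippet below drives the new public `rmat_rectangular_gen` overload that takes a single `(a, b, c)` distribution, mirroring how the unit test above calls it. The include paths follow the test file, and the handle parameter, scales, edge count, seed, and probability values are assumed for illustration.

```cpp
// Minimal sketch of calling the new API; values below are illustrative assumptions.
#include <raft/handle.hpp>
#include <raft/random/rmat_rectangular_generator.cuh>
#include <raft/random/rng.cuh>
#include <rmm/device_uvector.hpp>

void example_rmat(raft::handle_t const& handle)
{
  cudaStream_t stream = handle.get_stream();
  size_t r_scale = 16, c_scale = 18, n_edges = 100000;

  // Interleaved edge list plus separate src/dst lists; any of these three outputs
  // can instead be skipped by passing nullptr for it.
  rmm::device_uvector<size_t> out(n_edges * 2, stream);
  rmm::device_uvector<size_t> out_src(n_edges, stream);
  rmm::device_uvector<size_t> out_dst(n_edges, stream);

  // One (a, b, c) distribution shared across all scales; d is implicitly 1 - a - b - c.
  raft::random::RngState state{137ULL, raft::random::GeneratorType::GenPC};
  raft::random::rmat_rectangular_gen(out.data(),
                                     out_src.data(),
                                     out_dst.data(),
                                     0.57f,
                                     0.19f,
                                     0.19f,
                                     r_scale,
                                     c_scale,
                                     n_edges,
                                     stream,
                                     state);
}
```

Note that the generated edges may contain duplicates and self-loops, and r_scale != c_scale (as here) produces a bipartite, rectangular adjacency; any deduplication or symmetrization is left to the caller, as stated in the header documentation.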