Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create cub-based argmin primitive and replace argmin_along_rows in ANN kmeans #912

Merged
merged 6 commits into from
Nov 9, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ COMPILE_DIST_LIBRARY=OFF
ENABLE_NN_DEPENDENCIES=OFF

TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;SPARSE_BENCH;RANDOM_BENCH"
BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH"
ENABLE_thrust_DEPENDENCY=ON

CACHE_ARGS=""
Expand Down
6 changes: 6 additions & 0 deletions cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@ if(BUILD_BENCH)
bench/main.cpp
)

ConfigureBench(NAME MATRIX_BENCH
PATH
bench/matrix/argmin.cu
bench/main.cpp
)

ConfigureBench(NAME RANDOM_BENCH
PATH
bench/random/make_blobs.cu
Expand Down
59 changes: 59 additions & 0 deletions cpp/bench/matrix/argmin.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/matrix/math.cuh>

#include <rmm/device_uvector.hpp>

namespace raft::bench::linalg {

struct ArgminParams {
int64_t rows, cols;
};

template <typename T, typename OutT>
struct Argmin : public fixture {
Argmin(const ArgminParams& p)
: params(p), matrix(p.rows * p.cols, stream), indices(p.cols, stream)
Nyrio marked this conversation as resolved.
Show resolved Hide resolved
{
}

void run_benchmark(::benchmark::State& state) override
{
loop_on_state(state, [this]() {
raft::matrix::argmin(matrix.data(), params.rows, params.cols, indices.data(), stream);
});
}

private:
ArgminParams params;
rmm::device_uvector<T> matrix;
rmm::device_uvector<OutT> indices;
}; // struct Argmin

const std::vector<ArgminParams> kInputSizes{
{64, 1000}, {128, 1000}, {256, 1000}, {512, 1000}, {1024, 1000},
{64, 10000}, {128, 10000}, {256, 10000}, {512, 10000}, {1024, 10000},
{64, 100000}, {128, 100000}, {256, 100000}, {512, 100000}, {1024, 100000},
{64, 1000000}, {128, 1000000}, {256, 1000000}, {512, 1000000}, {1024, 1000000},
{64, 10000000}, {128, 10000000}, {256, 10000000}, {512, 10000000}, {1024, 10000000},
};

RAFT_BENCH_REGISTER((Argmin<float, uint32_t>), "", kInputSizes);
RAFT_BENCH_REGISTER((Argmin<double, uint32_t>), "", kInputSizes);

} // namespace raft::bench::linalg
55 changes: 34 additions & 21 deletions cpp/include/raft/matrix/detail/math.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -362,45 +362,58 @@ void matrixVectorBinarySub(Type* data,
stream);
}

// Computes the argmax(d_in) column-wise in a DxN matrix
template <typename T, typename IdxT, int TPB>
__global__ void argmaxKernel(const T* d_in, int D, int N, IdxT* argmax)
// Computes an argmin/argmax column-wise in a DxN matrix
template <typename RedOp, int TPB, typename T, typename OutT, typename IdxT>
__global__ void argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out)
{
typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
typedef cub::
BlockReduce<cub::KeyValuePair<IdxT, T>, TPB, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>
BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;

// compute maxIndex=argMax index for column
using KVP = cub::KeyValuePair<int, T>;
int rowStart = blockIdx.x * D;
KVP thread_data(-1, -raft::myInf<T>());
using KVP = cub::KeyValuePair<IdxT, T>;
IdxT rowStart = static_cast<IdxT>(blockIdx.x) * D;
KVP thread_data(0, std::is_same_v<RedOp, cub::ArgMax> ? -raft::myInf<T>() : raft::myInf<T>());

for (int i = threadIdx.x; i < D; i += TPB) {
int idx = rowStart + i;
thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
for (IdxT i = threadIdx.x; i < D; i += TPB) {
IdxT idx = rowStart + i;
thread_data = RedOp()(thread_data, KVP(i, d_in[idx]));
}

auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, RedOp());

if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; }
if (threadIdx.x == 0) { out[blockIdx.x] = maxKV.key; }
}

template <typename math_t, typename idx_t>
void argmax(const math_t* in, int n_rows, int n_cols, idx_t* out, cudaStream_t stream)
template <typename RedOp, typename math_t, typename out_t, typename idx_t>
inline void argReduce(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
int D = n_rows;
int N = n_cols;
idx_t D = n_rows;
idx_t N = n_cols;
if (D <= 32) {
argmaxKernel<math_t, idx_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 32><<<N, 32, 0, stream>>>(in, D, N, out);
} else if (D <= 64) {
argmaxKernel<math_t, idx_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 64><<<N, 64, 0, stream>>>(in, D, N, out);
} else if (D <= 128) {
argmaxKernel<math_t, idx_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 128><<<N, 128, 0, stream>>>(in, D, N, out);
} else {
argmaxKernel<math_t, idx_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 256><<<N, 256, 0, stream>>>(in, D, N, out);
}
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

template <typename math_t, typename out_t, typename idx_t>
void argmin(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
argReduce<cub::ArgMin>(in, n_rows, n_cols, out, stream);
}

template <typename math_t, typename out_t, typename idx_t>
void argmax(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
argReduce<cub::ArgMax>(in, n_rows, n_cols, out, stream);
}

// Utility kernel needed for signFlip.
// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
// flipping the sign if the |max| value for each column is negative.
Expand Down
18 changes: 16 additions & 2 deletions cpp/include/raft/matrix/math.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,20 @@ void ratio(

/** @} */

/**
* @brief Argmin: find the row idx with minimum value for each column
* @param in: input matrix
Nyrio marked this conversation as resolved.
Show resolved Hide resolved
* @param n_rows: number of rows of input matrix
* @param n_cols: number of columns of input matrix
* @param out: output vector of size n_cols
* @param stream: cuda stream
*/
template <typename math_t, typename out_t, typename idx_t = int>
void argmin(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
Nyrio marked this conversation as resolved.
Show resolved Hide resolved
{
detail::argmin(in, n_rows, n_cols, out, stream);
}

/**
* @brief Argmax: find the row idx with maximum value for each column
* @param in: input matrix
Nyrio marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -310,8 +324,8 @@ void ratio(
* @param out: output vector of size n_cols
* @param stream: cuda stream
*/
template <typename math_t, typename idx_t>
void argmax(const math_t* in, int n_rows, int n_cols, idx_t* out, cudaStream_t stream)
template <typename math_t, typename out_t, typename idx_t = int>
void argmax(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
Nyrio marked this conversation as resolved.
Show resolved Hide resolved
{
detail::argmax(in, n_rows, n_cols, out, stream);
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <raft/linalg/gemm.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/math.cuh>
#include <raft/matrix/matrix.cuh>
#include <raft/util/cuda_utils.cuh>

Expand Down Expand Up @@ -144,8 +145,7 @@ inline void predict_float_core(const handle_t& handle,
distances.data(),
n_clusters,
stream);
utils::argmin_along_rows(
n_rows, static_cast<IdxT>(n_clusters), distances.data(), labels, stream);
raft::matrix::argmin(distances.data(), (IdxT)n_clusters, n_rows, labels, stream);
break;
}
default: {
Expand Down
60 changes: 0 additions & 60 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -151,66 +151,6 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream)
}
}

template <typename IdxT, typename OutT>
__global__ void argmin_along_rows_kernel(IdxT n_rows, uint32_t n_cols, const float* a, OutT* out)
Nyrio marked this conversation as resolved.
Show resolved Hide resolved
{
__shared__ OutT shm_ids[1024]; // NOLINT
__shared__ float shm_vals[1024]; // NOLINT
IdxT i = blockIdx.x;
if (i >= n_rows) return;
OutT min_idx = n_cols;
float min_val = raft::upper_bound<float>();
for (OutT j = threadIdx.x; j < n_cols; j += blockDim.x) {
if (min_val > a[j + n_cols * i]) {
min_val = a[j + n_cols * i];
min_idx = j;
}
}
shm_vals[threadIdx.x] = min_val;
shm_ids[threadIdx.x] = min_idx;
__syncthreads();
for (IdxT offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if (threadIdx.x < offset) {
if (shm_vals[threadIdx.x] < shm_vals[threadIdx.x + offset]) {
} else if (shm_vals[threadIdx.x] > shm_vals[threadIdx.x + offset]) {
shm_vals[threadIdx.x] = shm_vals[threadIdx.x + offset];
shm_ids[threadIdx.x] = shm_ids[threadIdx.x + offset];
} else if (shm_ids[threadIdx.x] > shm_ids[threadIdx.x + offset]) {
shm_ids[threadIdx.x] = shm_ids[threadIdx.x + offset];
}
}
__syncthreads();
}
if (threadIdx.x == 0) { out[i] = shm_ids[0]; }
}

/**
* @brief Find index of the smallest element in each row.
*
* NB: device-only function
* TODO: specialize select_k for the case of `k == 1` and use that one instead.
*
* @tparam IdxT index type
* @tparam OutT output type
*
* @param n_rows
* @param n_cols
* @param[in] a device pointer to the row-major matrix [n_rows, n_cols]
* @param[out] out device pointer to the vector of selected indices [n_rows]
* @param stream
*/
template <typename IdxT, typename OutT>
inline void argmin_along_rows(
IdxT n_rows, IdxT n_cols, const float* a, OutT* out, rmm::cuda_stream_view stream)
{
IdxT block_dim = 1024;
while (block_dim > n_cols) {
block_dim /= 2;
}
block_dim = max(block_dim, (IdxT)128);
argmin_along_rows_kernel<IdxT, OutT><<<n_rows, block_dim, 0, stream>>>(n_rows, n_cols, a, out);
}

template <typename IdxT>
__global__ void dots_along_rows_kernel(IdxT n_rows, IdxT n_cols, const float* a, float* out)
{
Expand Down