Replace normalize_rows in ann_utils.cuh by a new rowNormalize prim and improve performance for thin matrices (small n_cols) #979

Merged
10 commits merged on Nov 17, 2022
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -95,6 +95,7 @@ if(BUILD_BENCH)
bench/linalg/add.cu
bench/linalg/map_then_reduce.cu
bench/linalg/matrix_vector_op.cu
bench/linalg/normalize.cu
bench/linalg/reduce_rows_by_key.cu
bench/linalg/reduce.cu
bench/main.cpp
79 changes: 79 additions & 0 deletions cpp/bench/linalg/normalize.cu
@@ -0,0 +1,79 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/linalg/normalize.cuh>
#include <raft/random/rng.cuh>
#include <raft/util/itertools.hpp>

#include <rmm/device_uvector.hpp>

namespace raft::bench::linalg {

template <typename IdxT>
struct normalize_input {
IdxT rows, cols;
};

template <typename IdxT>
inline auto operator<<(std::ostream& os, const normalize_input<IdxT>& p) -> std::ostream&
{
os << p.rows << "#" << p.cols;
return os;
}

template <typename T, typename IdxT>
struct rowNormalize : public fixture {
rowNormalize(const normalize_input<IdxT>& p)
: params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream)
{
raft::random::RngState rng{1234};
raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream);
}

void run_benchmark(::benchmark::State& state) override
{
std::ostringstream label_stream;
label_stream << params;
state.SetLabel(label_stream.str());

loop_on_state(state, [this]() {
auto input_view = raft::make_device_matrix_view<const T, IdxT, raft::row_major>(
in.data(), params.rows, params.cols);
auto output_view = raft::make_device_matrix_view<T, IdxT, raft::row_major>(
out.data(), params.rows, params.cols);
raft::linalg::rowNormalize(handle, input_view, output_view);
});
}

private:
normalize_input<IdxT> params;
rmm::device_uvector<T> in, out;
}; // struct rowNormalize

const std::vector<normalize_input<int>> normalize_inputs_i32 =
raft::util::itertools::product<normalize_input<int>>({1000, 10000, 100000, 1000000},
{2, 4, 8, 16, 32, 64, 128, 256});
const std::vector<normalize_input<int64_t>> normalize_inputs_i64 =
raft::util::itertools::product<normalize_input<int64_t>>({1000, 10000, 100000, 1000000},
{2, 4, 8, 16, 32, 64, 128, 256});

RAFT_BENCH_REGISTER((rowNormalize<float, int>), "", normalize_inputs_i32);
RAFT_BENCH_REGISTER((rowNormalize<double, int>), "", normalize_inputs_i32);
RAFT_BENCH_REGISTER((rowNormalize<float, int64_t>), "", normalize_inputs_i64);
RAFT_BENCH_REGISTER((rowNormalize<double, int64_t>), "", normalize_inputs_i64);

} // namespace raft::bench::linalg
80 changes: 80 additions & 0 deletions cpp/include/raft/linalg/detail/normalize.cuh
@@ -0,0 +1,80 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <raft/util/cuda_utils.cuh>

namespace raft {
namespace linalg {
namespace detail {

template <int warpSize, int rpb>
struct NormalizeWarpPolicy {
static constexpr int LogicalWarpSize = warpSize;
static constexpr int RowsPerBlock = rpb;
static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock;
};

template <typename Policy, typename Type, typename IdxType>
__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
coalescedNormalizeWarpKernel(Type* out, const Type* in, IdxType D, IdxType N)
{
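// Each row i is handled by one logical warp: blockDim.x (= LogicalWarpSize) threads indexed by
// threadIdx.x, with threadIdx.y selecting the row within the thread block.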
IdxType i = threadIdx.y + (blockDim.y * static_cast<IdxType>(blockIdx.x));
if (i >= N) return;

Type sqsum = 0.0;
for (IdxType j = threadIdx.x; j < D; j += blockDim.x) {
Type val = in[j + D * i];
sqsum += val * val;
}
sqsum = raft::logicalWarpReduce<Policy::LogicalWarpSize>(sqsum);
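// Rows whose squared norm is (near-)zero are skipped, leaving the corresponding output row unwritten.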
if (sqsum <= 1e-8) return;
sqsum = rsqrt(sqsum);
for (IdxType j = threadIdx.x; j < D; j += blockDim.x) {
out[j + D * i] = in[j + D * i] * sqsum;
}
}

template <typename Policy, typename Type, typename IdxType>
inline void coalescedNormalizeLauncher(
Type* out, const Type* in, IdxType D, IdxType N, cudaStream_t stream)
{
dim3 grid(ceildiv(N, (IdxType)Policy::RowsPerBlock), 1, 1);
dim3 block(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1);
coalescedNormalizeWarpKernel<Policy><<<grid, block, 0, stream>>>(out, in, D, N);
}

template <typename Type, typename IdxType>
void coalescedNormalize(Type* out, const Type* in, IdxType D, IdxType N, cudaStream_t stream)
{
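// Heuristic: use the smallest logical warp size no smaller than D (capped at 32; longer rows are
// handled with a strided loop per thread), and pack more rows per block as the logical warps get
// narrower so that ThreadsPerBlock stays at 128.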
if (D <= 2) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<2, 64>>(out, in, D, N, stream);
} else if (D <= 4) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<4, 32>>(out, in, D, N, stream);
} else if (D <= 8) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<8, 16>>(out, in, D, N, stream);
} else if (D <= 16) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<16, 8>>(out, in, D, N, stream);
} else {
coalescedNormalizeLauncher<NormalizeWarpPolicy<32, 4>>(out, in, D, N, stream);
}
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

} // namespace detail
} // namespace linalg
} // namespace raft
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/norm.cuh
@@ -44,7 +44,7 @@ using detail::NormType;
* @tparam Lambda device final lambda
* @tparam IdxType Integer type used for addressing
* @param dots the output vector of row-wise dot products
* @param data the input matrix (currently assumed to be row-major)
* @param data the input matrix
* @param D number of columns of data
* @param N number of rows of data
* @param type the type of norm to be applied
Expand All @@ -71,7 +71,7 @@ void rowNorm(Type* dots,
* @tparam Lambda device final lambda
* @tparam IdxType Integer type used for addressing
* @param dots the output vector of column-wise dot products
* @param data the input matrix (currently assumed to be row-major)
* @param data the input matrix
* @param D number of columns of data
* @param N number of rows of data
* @param type the type of norm to be applied
53 changes: 53 additions & 0 deletions cpp/include/raft/linalg/normalize.cuh
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "detail/normalize.cuh"

namespace raft {
namespace linalg {

/**
* @brief Divide rows by their L2 norm.
*
* Note that the implementation is optimized for tall matrices with many rows and relatively few
* columns, not "thick" matrices with few long rows.
*
* @tparam ElementType Input/Output data type
* @tparam IndexType Integer type used for addressing
* @param[in] handle raft::handle_t
* @param[in] in the input raft::device_matrix_view
* @param[out] out the output raft::device_matrix_view
*/
template <typename ElementType, typename IndexType>
void rowNormalize(const raft::handle_t& handle,
raft::device_matrix_view<const ElementType, IndexType, row_major> in,
raft::device_matrix_view<ElementType, IndexType, row_major> out)
{
RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
RAFT_EXPECTS(in.extent(0) == out.extent(0),
"The number of rows of the input and output should be equal");
RAFT_EXPECTS(in.extent(1) == out.extent(1),
"The number of columns of the input and output should be equal");

detail::coalescedNormalize(
out.data_handle(), in.data_handle(), in.extent(1), in.extent(0), handle.get_stream());
}

} // namespace linalg
} // namespace raft
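
For illustration, here is a minimal usage sketch of the new prim. The wrapper function, its raw-pointer parameters, and the float/int instantiation are hypothetical, and the header paths are assumed to follow the current include layout:

#include <raft/core/device_mdspan.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/normalize.cuh>

// Hypothetical helper: L2-normalize each row of a row-major [n_rows, n_cols] device matrix.
void normalize_rows_example(
  const raft::handle_t& handle, const float* d_in, float* d_out, int n_rows, int n_cols)
{
  // Wrap the raw device pointers in row-major matrix views, as in the benchmark above.
  auto in_view =
    raft::make_device_matrix_view<const float, int, raft::row_major>(d_in, n_rows, n_cols);
  auto out_view = raft::make_device_matrix_view<float, int, raft::row_major>(d_out, n_rows, n_cols);
  // Divide every row by its L2 norm; rows with a (near-)zero norm are skipped and left unwritten.
  raft::linalg::rowNormalize(handle, in_view, out_view);
}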
12 changes: 10 additions & 2 deletions cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -32,6 +32,7 @@
#include <raft/linalg/gemm.cuh>
#include <raft/linalg/matrix_vector_op.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/normalize.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/matrix.cuh>
#include <raft/util/cuda_utils.cuh>
@@ -665,8 +666,15 @@ void balancing_em_iters(const handle_t& handle,
// To avoid converging to zero, we normalize the center vectors on every iteration.
case raft::distance::DistanceType::InnerProduct:
case raft::distance::DistanceType::CosineExpanded:
case raft::distance::DistanceType::CorrelationExpanded:
utils::normalize_rows<uint32_t>(n_clusters, dim, cluster_centers, stream);
case raft::distance::DistanceType::CorrelationExpanded: {
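// Normalization is done in place: the const input view and the output view alias the same
// cluster_centers buffer.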
auto clusters_in_view =
raft::make_device_matrix_view<const float, uint32_t, raft::row_major>(
cluster_centers, n_clusters, dim);
auto clusters_out_view = raft::make_device_matrix_view<float, uint32_t, raft::row_major>(
cluster_centers, n_clusters, dim);
raft::linalg::rowNormalize(handle, clusters_in_view, clusters_out_view);
break;
}
default: break;
}
// E: Expectation step - predict labels
43 changes: 0 additions & 43 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -259,49 +259,6 @@ inline void dots_along_rows(
*/
}

template <typename IdxT>
__global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a)
{
IdxT i = threadIdx.y + (blockDim.y * static_cast<IdxT>(blockIdx.x));
if (i >= n_rows) return;

float sqsum = 0.0;
for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) {
float val = a[j + (n_cols * i)];
sqsum += val * val;
}
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16);
if (sqsum <= 1e-8) return;
sqsum = rsqrtf(sqsum); // reciprocal of the square root
for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) {
a[j + n_cols * i] *= sqsum;
}
}

/**
* @brief Divide rows by their L2 norm (square root of sum of squares).
*
* NB: device-only function
*
* @tparam IdxT index type
*
* @param[in] n_rows
* @param[in] n_cols
* @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
* @param stream
*/
template <typename IdxT>
inline void normalize_rows(IdxT n_rows, IdxT n_cols, float* a, rmm::cuda_stream_view stream)
{
dim3 threads(32, 4, 1); // DO NOT CHANGE
dim3 blocks(ceildiv(n_rows, threads.y), 1, 1);
normalize_rows_kernel<IdxT><<<blocks, threads, 0, stream>>>(n_rows, n_cols, a);
}

template <typename IdxT, typename Lambda>
__global__ void map_along_rows_kernel(
IdxT n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map)
26 changes: 20 additions & 6 deletions cpp/include/raft/util/cuda_utils.cuh
@@ -729,6 +729,25 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
#endif
}

/**
* @brief Logical-warp-level sum reduction
* @tparam logicalWarpSize Logical warp size (2, 4, 8, 16 or 32)
* @tparam T Value type to be reduced
* @param val input value
* @return Reduction result. All lanes will have the valid result.
* @todo Expand this to support arbitrary reduction ops
*/
template <int logicalWarpSize, typename T>
DI T logicalWarpReduce(T val)
{
#pragma unroll
for (int i = logicalWarpSize / 2; i > 0; i >>= 1) {
T tmp = shfl_xor(val, i);
val += tmp;
}
return val;
}
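// Illustrative usage (not part of this diff): with logicalWarpSize = 4, aligned groups of 4
// consecutive lanes each form a logical warp; after the xor-shuffle butterfly every lane in a
// group holds the sum of its group's inputs, e.g.
//   float group_sum = raft::logicalWarpReduce<4>(partial);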

/**
* @brief Warp-level sum reduction
* @param val input value
@@ -742,12 +761,7 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
template <typename T>
DI T warpReduce(T val)
{
#pragma unroll
for (int i = WarpSize / 2; i > 0; i >>= 1) {
T tmp = shfl_xor(val, i);
val += tmp;
}
return val;
return logicalWarpReduce<WarpSize>(val);
}

/**