Replace normalize_rows in ann_utils.cuh by a new rowNormalize prim and improve performance for thin matrices (small n_cols) #979

Merged
10 commits merged on Nov 17, 2022
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -95,6 +95,7 @@ if(BUILD_BENCH)
bench/linalg/add.cu
bench/linalg/map_then_reduce.cu
bench/linalg/matrix_vector_op.cu
bench/linalg/normalize.cu
bench/linalg/reduce_rows_by_key.cu
bench/linalg/reduce.cu
bench/main.cpp
79 changes: 79 additions & 0 deletions cpp/bench/linalg/normalize.cu
@@ -0,0 +1,79 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/linalg/normalize.cuh>
#include <raft/random/rng.cuh>
#include <raft/util/itertools.hpp>

#include <rmm/device_uvector.hpp>

namespace raft::bench::linalg {

template <typename IdxT>
struct normalize_input {
IdxT rows, cols;
};

template <typename IdxT>
inline auto operator<<(std::ostream& os, const normalize_input<IdxT>& p) -> std::ostream&
{
os << p.rows << "#" << p.cols;
return os;
}

template <typename T, typename IdxT>
struct rowNormalize : public fixture {
rowNormalize(const normalize_input<IdxT>& p)
: params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream)
{
raft::random::RngState rng{1234};
raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream);
}

void run_benchmark(::benchmark::State& state) override
{
std::ostringstream label_stream;
label_stream << params;
state.SetLabel(label_stream.str());

loop_on_state(state, [this]() {
auto input_view = raft::make_device_matrix_view<const T, IdxT, raft::row_major>(
in.data(), params.rows, params.cols);
auto output_view = raft::make_device_matrix_view<T, IdxT, raft::row_major>(
out.data(), params.rows, params.cols);
raft::linalg::rowNormalize(handle, input_view, output_view);
});
}

private:
normalize_input<IdxT> params;
rmm::device_uvector<T> in, out;
}; // struct rowNormalize

const std::vector<normalize_input<int>> normalize_inputs_i32 =
raft::util::itertools::product<normalize_input<int>>({1000, 10000, 100000, 1000000},
{2, 4, 8, 16, 32, 64, 128, 256});
const std::vector<normalize_input<int64_t>> normalize_inputs_i64 =
raft::util::itertools::product<normalize_input<int64_t>>({1000, 10000, 100000, 1000000},
{2, 4, 8, 16, 32, 64, 128, 256});

RAFT_BENCH_REGISTER((rowNormalize<float, int>), "", normalize_inputs_i32);
RAFT_BENCH_REGISTER((rowNormalize<double, int>), "", normalize_inputs_i32);
RAFT_BENCH_REGISTER((rowNormalize<float, int64_t>), "", normalize_inputs_i64);
RAFT_BENCH_REGISTER((rowNormalize<double, int64_t>), "", normalize_inputs_i64);

} // namespace raft::bench::linalg
80 changes: 80 additions & 0 deletions cpp/include/raft/linalg/detail/normalize.cuh
@@ -0,0 +1,80 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <raft/util/cuda_utils.cuh>

namespace raft {
namespace linalg {
namespace detail {

template <int warpSize, int rpb>
struct NormalizeWarpPolicy {
static constexpr int LogicalWarpSize = warpSize;
static constexpr int RowsPerBlock = rpb;
static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock;
};

template <typename Policy, typename Type, typename IdxType>
__global__ void __launch_bounds__(Policy::ThreadsPerBlock)
coalescedNormalizeWarpKernel(Type* out, const Type* in, IdxType D, IdxType N)
{
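// Each row i is handled by one logical warp: blockDim.x (= LogicalWarpSize) threads indexed by
// threadIdx.x, with threadIdx.y selecting the row within the thread block.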
IdxType i = threadIdx.y + (blockDim.y * static_cast<IdxType>(blockIdx.x));
if (i >= N) return;

Type sqsum = 0.0;
for (IdxType j = threadIdx.x; j < D; j += blockDim.x) {
Type val = in[j + D * i];
sqsum += val * val;
}
sqsum = raft::logicalWarpReduce<Policy::LogicalWarpSize>(sqsum);
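// Rows whose squared norm is (near-)zero are skipped, leaving the corresponding output row unwritten.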
if (sqsum <= 1e-8) return;
sqsum = rsqrt(sqsum);
for (IdxType j = threadIdx.x; j < D; j += blockDim.x) {
out[j + D * i] = in[j + D * i] * sqsum;
}
}

template <typename Policy, typename Type, typename IdxType>
inline void coalescedNormalizeLauncher(
Type* out, const Type* in, IdxType D, IdxType N, cudaStream_t stream)
{
dim3 grid(ceildiv(N, (IdxType)Policy::RowsPerBlock), 1, 1);
dim3 block(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1);
coalescedNormalizeWarpKernel<Policy><<<grid, block, 0, stream>>>(out, in, D, N);
}

template <typename Type, typename IdxType>
void coalescedNormalize(Type* out, const Type* in, IdxType D, IdxType N, cudaStream_t stream)
{
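// Heuristic: use the smallest logical warp size no smaller than D (capped at 32; longer rows are
// handled with a strided loop per thread), and pack more rows per block as the logical warps get
// narrower so that ThreadsPerBlock stays at 128.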
if (D <= 2) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<2, 64>>(out, in, D, N, stream);
} else if (D <= 4) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<4, 32>>(out, in, D, N, stream);
} else if (D <= 8) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<8, 16>>(out, in, D, N, stream);
} else if (D <= 16) {
coalescedNormalizeLauncher<NormalizeWarpPolicy<16, 8>>(out, in, D, N, stream);
} else {
coalescedNormalizeLauncher<NormalizeWarpPolicy<32, 4>>(out, in, D, N, stream);
}
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

} // namespace detail
} // namespace linalg
} // namespace raft
4 changes: 2 additions & 2 deletions cpp/include/raft/linalg/norm.cuh
@@ -44,7 +44,7 @@ using detail::NormType;
* @tparam Lambda device final lambda
* @tparam IdxType Integer type used for addressing
* @param dots the output vector of row-wise dot products
* @param data the input matrix (currently assumed to be row-major)
* @param data the input matrix
* @param D number of columns of data
* @param N number of rows of data
* @param type the type of norm to be applied
Expand All @@ -71,7 +71,7 @@ void rowNorm(Type* dots,
* @tparam Lambda device final lambda
* @tparam IdxType Integer type used for addressing
* @param dots the output vector of column-wise dot products
* @param data the input matrix (currently assumed to be row-major)
* @param data the input matrix
* @param D number of columns of data
* @param N number of rows of data
* @param type the type of norm to be applied
53 changes: 53 additions & 0 deletions cpp/include/raft/linalg/normalize.cuh
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "detail/normalize.cuh"

namespace raft {
namespace linalg {

/**
* @brief Divide rows by their L2 norm.
*
* Note that the implementation is optimized for tall matrices with many rows and relatively few
* columns, not "thick" matrices with few long rows.
*
* @tparam ElementType Input/Output data type
* @tparam IndexType Integer type used for addressing
* @param[in] handle raft::handle_t
* @param[in] in the input raft::device_matrix_view
* @param[out] out the output raft::device_matrix_view
*/
template <typename ElementType, typename IndexType>
void rowNormalize(const raft::handle_t& handle,
raft::device_matrix_view<const ElementType, IndexType, row_major> in,
raft::device_matrix_view<ElementType, IndexType, row_major> out)
{
RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous");
RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
RAFT_EXPECTS(in.extent(0) == out.extent(0),
"The number of rows of the input and output should be equal");
RAFT_EXPECTS(in.extent(1) == out.extent(1),
"The number of columns of the input and output should be equal");

detail::coalescedNormalize(
out.data_handle(), in.data_handle(), in.extent(1), in.extent(0), handle.get_stream());
}

} // namespace linalg
} // namespace raft
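
For illustration, here is a minimal usage sketch of the new prim. The wrapper function, its raw-pointer parameters, and the float/int instantiation are hypothetical, and the header paths are assumed to follow the current include layout:

#include <raft/core/device_mdspan.hpp>
#include <raft/core/handle.hpp>
#include <raft/linalg/normalize.cuh>

// Hypothetical helper: L2-normalize each row of a row-major [n_rows, n_cols] device matrix.
void normalize_rows_example(
  const raft::handle_t& handle, const float* d_in, float* d_out, int n_rows, int n_cols)
{
  // Wrap the raw device pointers in row-major matrix views, as in the benchmark above.
  auto in_view =
    raft::make_device_matrix_view<const float, int, raft::row_major>(d_in, n_rows, n_cols);
  auto out_view = raft::make_device_matrix_view<float, int, raft::row_major>(d_out, n_rows, n_cols);
  // Divide every row by its L2 norm; rows with a (near-)zero norm are skipped and left unwritten.
  raft::linalg::rowNormalize(handle, in_view, out_view);
}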
12 changes: 10 additions & 2 deletions cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -32,6 +32,7 @@
#include <raft/linalg/gemm.cuh>
#include <raft/linalg/matrix_vector_op.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/normalize.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/matrix.cuh>
#include <raft/util/cuda_utils.cuh>
@@ -665,8 +666,15 @@ void balancing_em_iters(const handle_t& handle,
// To avoid converging to zero, we normalize the center vectors on every iteration.
case raft::distance::DistanceType::InnerProduct:
case raft::distance::DistanceType::CosineExpanded:
case raft::distance::DistanceType::CorrelationExpanded:
utils::normalize_rows<uint32_t>(n_clusters, dim, cluster_centers, stream);
case raft::distance::DistanceType::CorrelationExpanded: {
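// Normalization is done in place: the const input view and the output view alias the same
// cluster_centers buffer.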
auto clusters_in_view =
raft::make_device_matrix_view<const float, uint32_t, raft::row_major>(
cluster_centers, n_clusters, dim);
auto clusters_out_view = raft::make_device_matrix_view<float, uint32_t, raft::row_major>(
cluster_centers, n_clusters, dim);
raft::linalg::rowNormalize(handle, clusters_in_view, clusters_out_view);
break;
}
default: break;
}
// E: Expectation step - predict labels
43 changes: 0 additions & 43 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -259,49 +259,6 @@ inline void dots_along_rows(
*/
}

template <typename IdxT>
__global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a)
{
IdxT i = threadIdx.y + (blockDim.y * static_cast<IdxT>(blockIdx.x));
if (i >= n_rows) return;

float sqsum = 0.0;
for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) {
float val = a[j + (n_cols * i)];
sqsum += val * val;
}
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 1);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 2);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 4);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 8);
sqsum += __shfl_xor_sync(0xffffffff, sqsum, 16);
if (sqsum <= 1e-8) return;
sqsum = rsqrtf(sqsum); // reciprocal of the square root
for (IdxT j = threadIdx.x; j < n_cols; j += blockDim.x) {
a[j + n_cols * i] *= sqsum;
}
}

/**
* @brief Divide rows by their L2 norm (square root of sum of squares).
*
* NB: device-only function
*
* @tparam IdxT index type
*
* @param[in] n_rows
* @param[in] n_cols
* @param[inout] a device pointer to a row-major matrix [n_rows, n_cols]
* @param stream
*/
template <typename IdxT>
inline void normalize_rows(IdxT n_rows, IdxT n_cols, float* a, rmm::cuda_stream_view stream)
{
dim3 threads(32, 4, 1); // DO NOT CHANGE
dim3 blocks(ceildiv(n_rows, threads.y), 1, 1);
normalize_rows_kernel<IdxT><<<blocks, threads, 0, stream>>>(n_rows, n_cols, a);
}

template <typename IdxT, typename Lambda>
__global__ void map_along_rows_kernel(
IdxT n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map)
26 changes: 20 additions & 6 deletions cpp/include/raft/util/cuda_utils.cuh
@@ -729,6 +729,25 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
#endif
}

/**
* @brief Logical-warp-level sum reduction
* @tparam logicalWarpSize Logical warp size (2, 4, 8, 16 or 32)
* @tparam T Value type to be reduced
* @param val input value
* @return Reduction result. All lanes will have the valid result.
* @todo Expand this to support arbitrary reduction ops
*/
template <int logicalWarpSize, typename T>
DI T logicalWarpReduce(T val)
{
#pragma unroll
for (int i = logicalWarpSize / 2; i > 0; i >>= 1) {
T tmp = shfl_xor(val, i);
val += tmp;
}
return val;
}
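// Illustrative usage (not part of this diff): with logicalWarpSize = 4, aligned groups of 4
// consecutive lanes each form a logical warp; after the xor-shuffle butterfly every lane in a
// group holds the sum of its group's inputs, e.g.
//   float group_sum = raft::logicalWarpReduce<4>(partial);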

/**
* @brief Warp-level sum reduction
* @param val input value
@@ -742,12 +761,7 @@ DI auto dp4a(unsigned int a, unsigned int b, unsigned int c) -> unsigned int
template <typename T>
DI T warpReduce(T val)
{
#pragma unroll
for (int i = WarpSize / 2; i > 0; i >>= 1) {
T tmp = shfl_xor(val, i);
val += tmp;
}
return val;
return logicalWarpReduce<WarpSize>(val);
}

/**