[REVIEW] Moving cuML prims to RAFT #65

Merged · 19 commits · Oct 23, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,6 +1,7 @@
# RAFT 0.17.0 (Date TBD)

## New Features
- PR #65: Add cuML prims that break the circular dependency between the cuml and cumlprims projects

## Improvements
- PR #73: Move DistanceType enum from cuML to RAFT
26 changes: 26 additions & 0 deletions cpp/CMakeLists.txt
@@ -235,8 +235,34 @@ if(BUILD_RAFT_TESTS)
test/cudart_utils.cpp
test/handle.cpp
test/integer_utils.cpp
test/linalg/add.cu
test/linalg/binary_op.cu
test/linalg/coalesced_reduction.cu
test/linalg/divide.cu
test/linalg/eig.cu
test/linalg/eig_sel.cu
test/linalg/gemm_layout.cu
test/linalg/map_then_reduce.cu
test/linalg/matrix_vector_op.cu
test/linalg/multiply.cu
test/linalg/norm.cu
test/linalg/reduce.cu
test/linalg/strided_reduction.cu
test/linalg/subtract.cu
test/linalg/svd.cu
test/linalg/transpose.cu
test/linalg/unary_op.cu
test/matrix/math.cu
test/matrix/matrix.cu
test/mr/device/buffer.cpp
test/mr/host/buffer.cpp
test/random/rng.cu
test/random/rng_int.cu
test/random/sample_without_replacement.cu
test/stats/mean.cu
test/stats/mean_center.cu
test/stats/stddev.cu
test/stats/sum.cu
test/test.cpp
test/spectral_matrix.cu
test/eigen_solvers.cu
49 changes: 49 additions & 0 deletions cpp/include/raft/common/cub_wrappers.cuh
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cub/cub.cuh>
#include <raft/mr/device/buffer.hpp>

namespace raft {

/**
* @brief Convenience wrapper over cub's SortPairs method
* @tparam KeyT key type
* @tparam ValueT value type
 * @param workspace scratch buffer; it will be resized if it is not large enough
* @param inKeys input keys array
* @param outKeys output keys array
* @param inVals input values array
* @param outVals output values array
* @param len array length
* @param stream cuda stream
*/
template <typename KeyT, typename ValueT>
void sortPairs(raft::mr::device::buffer<char> &workspace, const KeyT *inKeys,
KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
cudaStream_t stream) {
  size_t worksize = 0;
  // First pass with a null workspace pointer only computes the required
  // temporary storage size into `worksize`.
  cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals,
                                  outVals, len, 0, sizeof(KeyT) * 8, stream);
  // Grow the caller-provided scratch buffer if needed, then sort for real.
  workspace.resize(worksize, stream);
  cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys,
                                  inVals, outVals, len, 0, sizeof(KeyT) * 8,
                                  stream);
}

} // namespace raft
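A minimal usage sketch for the wrapper above (illustration only, not part of this diff): it assumes the `(allocator, stream, size)` constructor of `raft::mr::device::buffer` and the `default_allocator` from `raft/mr/device/allocator.hpp`; the helper function and variable names here are hypothetical.

```cuda
#include <raft/common/cub_wrappers.cuh>
#include <raft/mr/device/allocator.hpp>
#include <raft/mr/device/buffer.hpp>

#include <memory>

// Hypothetical helper: sort float keys carrying int payloads on `stream`.
void sort_pairs_example(const float *d_keys_in, float *d_keys_out,
                        const int *d_vals_in, int *d_vals_out, int len,
                        cudaStream_t stream) {
  auto alloc = std::make_shared<raft::mr::device::default_allocator>();
  // Start with an empty scratch buffer; sortPairs resizes it as needed.
  raft::mr::device::buffer<char> workspace(alloc, stream, 0);
  raft::sortPairs(workspace, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                  len, stream);
}
```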
97 changes: 97 additions & 0 deletions cpp/include/raft/common/scatter.cuh
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <raft/cuda_utils.cuh>
#include <raft/vectorized.cuh>

namespace raft {

template <typename DataT, int VecLen, typename Lambda, typename IdxT>
__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
                              IdxT len, Lambda op) {
  typedef TxN_t<DataT, VecLen> DataVec;
  typedef TxN_t<IdxT, VecLen> IdxVec;
  // Each thread handles VecLen consecutive elements via vectorized IO.
  IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
  tid *= VecLen;
  if (tid >= len) return;
  IdxVec idxIn;
  idxIn.load(idx, tid);
  DataVec dataIn;
#pragma unroll
  for (int i = 0; i < VecLen; ++i) {
    // Read in[idx[tid + i]], apply op with the destination index tid + i,
    // and stage the result for a single vectorized store.
    auto inPos = idxIn.val.data[i];
    dataIn.val.data[i] = op(in[inPos], tid + i);
  }
  dataIn.store(out, tid);
}

template <typename DataT, int VecLen, typename Lambda, typename IdxT, int TPB>
void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
                 Lambda op, cudaStream_t stream) {
  // The VecLen check guards the VecLen == 0 instantiations that the
  // dispatch in scatter() below can generate, avoiding a divide-by-zero.
  const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB);
  scatterKernel<DataT, VecLen, Lambda, IdxT>
    <<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
  CUDA_CHECK(cudaGetLastError());
}

/**
 * @brief Performs scatter operation based on the input indexing array
 * @tparam DataT data type whose array gets scattered
 * @tparam IdxT indexing type
 * @tparam TPB threads-per-block in the final kernel launched
 * @tparam Lambda the device lambda applied to each loaded element before it
 * gets scattered
 * @param out the output array
 * @param in the input array
 * @param idx the indexing array
 * @param len number of elements in the input array
 * @param stream cuda stream in which to launch the work
 * @param op the device lambda with signature `DataT func(DataT, IdxT);`. It
 * is applied to every element before scattering it to its final location;
 * its second parameter is the destination index.
 */
template <typename DataT, typename IdxT,
          typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
             cudaStream_t stream, Lambda op = raft::Nop<DataT, IdxT>()) {
  if (len <= 0) return;
  constexpr size_t DataSize = sizeof(DataT);
  constexpr size_t IdxSize = sizeof(IdxT);
  constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize;
  size_t bytes = len * MaxPerElem;
  // Dispatch on the widest vectorized IO (16 bytes down to 1) whose width
  // covers at least one element (N / MaxPerElem > 0) and evenly divides the
  // total byte count of the arrays.
  if (16 / MaxPerElem && bytes % 16 == 0) {
    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len,
                                                           op, stream);
  } else if (8 / MaxPerElem && bytes % 8 == 0) {
    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else if (4 / MaxPerElem && bytes % 4 == 0) {
    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else if (2 / MaxPerElem && bytes % 2 == 0) {
    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else if (1 / MaxPerElem) {
    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else {
    scatterImpl<DataT, 1, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
  }
}

} // namespace raft
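Similarly, a sketch of calling `raft::scatter` (illustration only, not part of this diff; the helper name is hypothetical, and the device lambda requires nvcc's extended-lambda mode). With the default `raft::Nop` op the call simply permutes `in` into `out` according to `idx`; a custom op can additionally transform each element:

```cuda
#include <raft/common/scatter.cuh>

// Hypothetical helper: computes out[i] = -in[idx[i]] for i in [0, len).
void scatter_example(float *d_out, const float *d_in, const int *d_idx,
                     int len, cudaStream_t stream) {
  raft::scatter(d_out, d_in, d_idx, len, stream,
                [] __device__(float val, int /* destination index */) {
                  return -val;
                });
}
```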