[REVIEW] Moving cuML prims to RAFT (#65)
* rng working
* linalg tests working
* got all tests working
* changelog
* clang format
* updated from cuml
* style check
* added leftover prims
* more style format
* adding cuda check include files
* its actually in cudart_utils.h

Co-authored-by: Dante Gama Dessavre <[email protected]>
Showing 68 changed files with 11,292 additions and 0 deletions.
@@ -0,0 +1,49 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cub/cub.cuh>
#include <raft/mr/device/buffer.hpp>

namespace raft {

/**
 * @brief Convenience wrapper over cub's SortPairs method
 * @tparam KeyT key type
 * @tparam ValueT value type
 * @param workspace workspace buffer which will be resized if it is not
 * large enough
 * @param inKeys input keys array
 * @param outKeys output keys array
 * @param inVals input values array
 * @param outVals output values array
 * @param len array length
 * @param stream cuda stream
 */
template <typename KeyT, typename ValueT>
void sortPairs(raft::mr::device::buffer<char> &workspace, const KeyT *inKeys,
               KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
               cudaStream_t stream) {
  // First call with a null temp-storage pointer only queries the required
  // workspace size, which cub writes into worksize.
  size_t worksize;
  cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals,
                                  outVals, len, 0, sizeof(KeyT) * 8, stream);
  // Grow the workspace if needed, then run the actual radix sort over the
  // full key-bit range [0, sizeof(KeyT) * 8).
  workspace.resize(worksize, stream);
  cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys,
                                  inVals, outVals, len, 0, sizeof(KeyT) * 8,
                                  stream);
}

}  // namespace raft
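
For reference, a minimal calling sketch for this wrapper. The function name and pointers below are illustrative, and the `buffer`/`default_allocator` construction is assumed from the `raft::mr::device` headers of this era rather than confirmed by this diff.

#include <memory>

#include <raft/mr/device/allocator.hpp>
#include <raft/mr/device/buffer.hpp>

// Hypothetical example, assuming the sortPairs header above is included:
// sorts int keys carrying float values on the given stream.
void sort_example(const int *d_keys_in, int *d_keys_out,
                  const float *d_vals_in, float *d_vals_out, int len,
                  cudaStream_t stream) {
  // Start with an empty workspace; sortPairs resizes it to whatever cub
  // requests in its size-query pass.
  auto alloc = std::make_shared<raft::mr::device::default_allocator>();
  raft::mr::device::buffer<char> workspace(alloc, stream, 0);
  raft::sortPairs(workspace, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
                  len, stream);
}

Reusing one workspace buffer across repeated sorts avoids reallocating temp storage on every call; resize only grows it when cub asks for more.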
@@ -0,0 +1,97 @@
/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <raft/cuda_utils.cuh>
#include <raft/vectorized.cuh>

namespace raft {

template <typename DataT, int VecLen, typename Lambda, typename IdxT>
__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
                              IdxT len, Lambda op) {
  typedef TxN_t<DataT, VecLen> DataVec;
  typedef TxN_t<IdxT, VecLen> IdxVec;
  // Each thread owns a contiguous chunk of VecLen output elements.
  IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
  tid *= VecLen;
  if (tid >= len) return;
  // Vectorized load of VecLen indices, elementwise reads of the inputs they
  // point at (with op applied), then one vectorized store to the output.
  IdxVec idxIn;
  idxIn.load(idx, tid);
  DataVec dataIn;
#pragma unroll
  for (int i = 0; i < VecLen; ++i) {
    auto inPos = idxIn.val.data[i];
    dataIn.val.data[i] = op(in[inPos], tid + i);
  }
  dataIn.store(out, tid);
}

template <typename DataT, int VecLen, typename Lambda, typename IdxT, int TPB>
void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
                 Lambda op, cudaStream_t stream) {
  // Each thread covers VecLen elements (VecLen == 0 means no vectorization).
  const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB);
  scatterKernel<DataT, VecLen, Lambda, IdxT>
    <<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
  CUDA_CHECK(cudaGetLastError());
}

/**
 * @brief Performs scatter operation based on the input indexing array
 * @tparam DataT data type whose array gets scattered
 * @tparam IdxT indexing type
 * @tparam TPB threads-per-block in the final kernel launched
 * @tparam Lambda the device-lambda performing a unary operation on the loaded
 * data before it gets scattered
 * @param out the output array
 * @param in the input array
 * @param idx the indexing array
 * @param len number of elements in the input array
 * @param stream cuda stream on which to launch the work
 * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. It
 * is applied to every element before scattering it to its final location;
 * the second argument is the destination index.
 */
template <typename DataT, typename IdxT,
          typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
             cudaStream_t stream, Lambda op = raft::Nop<DataT, IdxT>()) {
  if (len <= 0) return;
  constexpr size_t DataSize = sizeof(DataT);
  constexpr size_t IdxSize = sizeof(IdxT);
  constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize;
  size_t bytes = len * MaxPerElem;
  // Dispatch on the widest vector width (16B down to 1B) that evenly divides
  // the total size. For example, 4-byte data with 4-byte indices gives
  // MaxPerElem = 4, so bytes % 16 == 0 selects VecLen = 16 / 4 = 4.
  if (16 / MaxPerElem && bytes % 16 == 0) {
    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len,
                                                           op, stream);
  } else if (8 / MaxPerElem && bytes % 8 == 0) {
    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else if (4 / MaxPerElem && bytes % 4 == 0) {
    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else if (2 / MaxPerElem && bytes % 2 == 0) {
    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else if (1 / MaxPerElem) {
    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
                                                          stream);
  } else {
    scatterImpl<DataT, 1, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
  }
}

}  // namespace raft
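
A minimal usage sketch of the public scatter entry point. The function and pointer names are illustrative, and the device lambda assumes the build enables extended lambdas (e.g. nvcc's --expt-extended-lambda).

// Hypothetical example, assuming the scatter header above is included:
// computes out[i] = 2 * in[idx[i]] for i in [0, len).
void scatter_example(float *d_out, const float *d_in, const int *d_idx,
                     int len, cudaStream_t stream) {
  raft::scatter(d_out, d_in, d_idx, len, stream,
                [] __device__(float v, int /* destination index */) {
                  return 2.0f * v;
                });
}

With the trailing op omitted, the raft::Nop default turns this into a plain index-based copy.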