Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add Fused L2 Expanded KNN kernel #339

Merged
merged 29 commits into from
Nov 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
75c9f27
add fused L2 expanded kNN kernel, this is faster by at least 20-25% o…
mdoijade Sep 24, 2021
7a1e1e6
use lid > firsActiveLane instead of bitwise left shift and & for upda…
mdoijade Sep 27, 2021
e655cd4
Merge branch 'branch-21.12' into fusedL2ExpandedKNN
mdoijade Sep 27, 2021
290d28d
fix incorrect output for NN >32 case when taking prod-cons knn merge …
mdoijade Sep 28, 2021
60d9201
Merge branch 'branch-21.12' into fusedL2ExpandedKNN
mdoijade Sep 28, 2021
5f3cea1
fix clang format issues
mdoijade Sep 28, 2021
5b5f7a0
enable testing of cuml using this raft fork
mdoijade Sep 28, 2021
738c604
add custom atomicMax function which works fine if negative zeros are …
mdoijade Sep 29, 2021
15cbda8
merge branch-21.12 and test customAtomicMax without +0 addition
mdoijade Sep 29, 2021
352cc2d
fix hang in raft atomicMax of fp32 when the inputs are NaNs
mdoijade Sep 30, 2021
aa8ef09
remove redundant processing.hpp included in fused_l2_knn
mdoijade Oct 5, 2021
6072281
refactor fused L2 KNN main function to call both L2 expanded/unexpand…
mdoijade Oct 6, 2021
ae14f75
revert ball cover test to use brute_force_knn function instead of exp…
mdoijade Oct 6, 2021
53b6415
use isnan only if DeviceMax/Min operations in atomicCAS based functio…
mdoijade Oct 7, 2021
1d9ade3
fix clang format issues
mdoijade Oct 7, 2021
62cff7b
revert prtest.config changes, move fusedL2kNN launch/selection code t…
mdoijade Oct 11, 2021
9164a64
fix bug in updateSortedWarpQ for NN > 32, disable use of sqrt as it i…
mdoijade Oct 13, 2021
abc2b11
allocate workspace when resize is required for using prod-cons mutexes
mdoijade Oct 13, 2021
ec0cc32
add unit test for fused L2 KNN exp/unexp cases using faiss bfknn as g…
mdoijade Nov 2, 2021
700318d
merge branch-21.12 and update fused_l2_knn.cuh with those changes
mdoijade Nov 2, 2021
2b64775
move customAtomicMax to generic atomicMax specialization, and remove …
mdoijade Nov 2, 2021
ef9a898
fix clang format errors
mdoijade Nov 2, 2021
b317a12
call faiss before fusedL2knn kernel in the test
mdoijade Nov 3, 2021
9e2e19e
fix issues in verification function as it can happen that 2 vectors w…
mdoijade Nov 3, 2021
395beff
Merge branch 'branch-22.02' into fusedL2ExpandedKNN
mdoijade Nov 17, 2021
f0fd7b4
revert ball_cover test to use compute_bfknn which is wrapper for brut…
mdoijade Nov 17, 2021
bb099ca
Merge branch 'branch-21.12' into fusedL2ExpandedKNN
cjnolet Nov 17, 2021
a2f1dee
Merge branch 'branch-22.02' into fusedL2ExpandedKNN
cjnolet Nov 23, 2021
bdce263
Adjusting rng.cuh
cjnolet Nov 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions cpp/include/raft/device_atomics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,15 @@ struct genericAtomicOperationImpl<T, Op, 4> {
__forceinline__ __device__ T operator()(T* addr, T const& update_value,
Op op) {
using T_int = unsigned int;

T old_value = *addr;
T assumed{old_value};

if constexpr (std::is_same<T, float>{} && (std::is_same<Op, DeviceMin>{})) {
if (isnan(update_value)) {
return old_value;
}
}

do {
assumed = old_value;
const T new_value = op(old_value, update_value);
Expand All @@ -191,13 +196,32 @@ struct genericAtomicOperationImpl<T, Op, 4> {
type_reinterpret<T_int, T>(assumed),
type_reinterpret<T_int, T>(new_value));
old_value = type_reinterpret<T, T_int>(ret);

} while (assumed != old_value);

return old_value;
}
};

// 4 bytes fp32 atomic Max operation
template <>
struct genericAtomicOperationImpl<float, DeviceMax, 4> {
using T = float;
__forceinline__ __device__ T operator()(T* addr, T const& update_value,
DeviceMax op) {
if (isnan(update_value)) {
return *addr;
}

T old =
(update_value >= 0)
? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value)))
: __uint_as_float(
atomicMin((unsigned int*)addr, __float_as_uint(update_value)));

return old;
}
};

// 8 bytes atomic operation
template <typename T, typename Op>
struct genericAtomicOperationImpl<T, Op, 8> {
Expand Down Expand Up @@ -423,7 +447,6 @@ struct typesAtomicCASImpl<T, 4> {
T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
type_reinterpret<T_int, T>(compare),
type_reinterpret<T_int, T>(update_value));

return type_reinterpret<T, T_int>(ret);
}
};
Expand Down
Loading