Add lds and sts inline ptx instructions to force vector instruction generation #273

Merged · Jun 21, 2021 · 25 commits

Commits
3a4ec66
Refactor fusedL2NN to use pairwiseDistance class. invert block y/x di…
mdoijade May 12, 2021
76f9a72
-- add grid stride support to pairwise distance based cosine, l2, l1 …
mdoijade May 17, 2021
af89085
--Add grid stride based fusedL2NN kernel, this gives approx 1.67x spe…
mdoijade May 19, 2021
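
These two commits move the distance kernels to the standard CUDA grid-stride idiom, in which a fixed-size grid loops over the whole problem instead of launching one block per output tile. A minimal generic sketch of the pattern (the saxpy kernel is illustrative, not the PR's pairwise-distance code):

__global__ void saxpy(int n, float a, const float* x, float* y) {
  // Each thread starts at its global index and hops by the total number
  // of threads in the grid until the whole range [0, n) is covered.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    y[i] = a * x[i] + y[i];
  }
}

Because the grid size no longer depends on the problem size, it can be chosen to exactly fill the device, which is what the occupancy-based commit below does.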
9c71c4a
Add note on reason to use thread 0 from each warp to write final redu…
mdoijade May 19, 2021
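
The note concerns a common reduction convention: after a shuffle-based warp reduction the fully reduced value lands in lane 0, so only that thread writes the result, avoiding redundant stores and write races. A generic sketch of the pattern (not the PR's actual reduction code):

__device__ float warpReduceSum(float val) {
  // Shuffle-based reduction over the 32 lanes of a warp.
  for (int offset = 16; offset > 0; offset >>= 1) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;  // lane 0 ends up holding the full warp sum
}

__global__ void perWarpSums(const float* in, float* out) {
  int tid   = blockIdx.x * blockDim.x + threadIdx.x;
  float sum = warpReduceSum(in[tid]);
  // Only lane 0 of each warp writes; the other lanes hold partial values.
  if ((threadIdx.x & 31) == 0) out[tid >> 5] = sum;
}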
4d76b57
fix clangformat and copyright year
mdoijade May 19, 2021
da2d768
Merge branch 'branch-21.06' into gridStridedDist
mdoijade May 19, 2021
4ada29e
--Add additional Mblk + Nblk shmem for storing norms, and reuse xNorm…
mdoijade May 20, 2021
2e804c2
Use cudaOccupancyMaxActiveBlocksPerSM instead of hard-coded launch bo…
mdoijade May 24, 2021
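
Commit 2e804c2 sizes the grid from the occupancy calculator rather than hard-coded launch bounds. A sketch of the usual recipe; the runtime entry point is cudaOccupancyMaxActiveBlocksPerMultiprocessor, and the kernel name, block size, and shared-memory argument here are placeholders:

#include <cuda_runtime.h>

__global__ void distanceKernel();  // stand-in for the PR's kernel

int fullOccupancyGridSize(int blockSize, size_t dynSmemBytes) {
  int dev = 0, numSMs = 0, blocksPerSM = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
  // Ask the runtime how many resident blocks of this kernel fit on one SM.
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    &blocksPerSM, distanceKernel, blockSize, dynSmemBytes);
  // Fill the device exactly; the grid-stride loop covers any leftover work.
  return blocksPerSM * numSMs;
}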
3408a40
Merge branch 'branch-21.06' into gridStridedDist
mdoijade May 27, 2021
69b316d
initialize regx and regy during each prolog call
mdoijade Jun 1, 2021
9a30a87
initialize ldgX, ldgY in prolog
mdoijade Jun 1, 2021
969c65a
Merge branch 'branch-21.08' into gridStridedDist
mdoijade Jun 3, 2021
9c4d5a0
add syncthreads post epilog calc for non-norm distance metrics to mak…
mdoijade Jun 3, 2021
4fb00e6
remove syncthreads in epilog and instead use ping-pong buffers in nex…
mdoijade Jun 4, 2021
b5b3c51
use ping-pong buffers for safely grid striding
mdoijade Jun 4, 2021
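
These two commits replace the extra barrier from 9c4d5a0 with double buffering: while one shared-memory tile is being consumed, the next grid-stride iteration's data is prefetched into the other tile, so a single barrier per iteration suffices. A generic sketch of the idiom (the tile size and blockDim.x == 256 are illustrative assumptions):

__global__ void pingPongCopy(const float* in, float* out, int nTiles) {
  __shared__ float tile[2][256];  // two buffers that swap roles each iteration
  int cur = 0;
  tile[cur][threadIdx.x] = in[threadIdx.x];  // prolog: fetch the first tile
  __syncthreads();
  for (int t = 0; t < nTiles; ++t) {
    // Prefetch the next tile into the other buffer; this write cannot race
    // with the reads of tile[cur] below because the buffers are disjoint.
    if (t + 1 < nTiles) {
      tile[1 - cur][threadIdx.x] = in[(t + 1) * 256 + threadIdx.x];
    }
    // Read another thread's element, so the barrier is genuinely required.
    out[t * 256 + threadIdx.x] = tile[cur][255 - threadIdx.x];
    cur = 1 - cur;
    __syncthreads();  // one barrier covers both the prefetch and the reads
  }
}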
0f2c03d
fix build failure of mst and knn test by adding cuda stream arg to rm…
mdoijade Jun 7, 2021
484b082
temp commit for test rerun
mdoijade Jun 7, 2021
04f656f
use ucx-py version 0.21 to temp resolve ci build failures
mdoijade Jun 7, 2021
753f612
Merge branch 'fix_mst_knn_test' into gridStridedDist
mdoijade Jun 7, 2021
f73471c
merge branch-21.08
mdoijade Jun 7, 2021
8007d7a
Merge branch 'fix_mst_knn_test' into gridStridedDist
mdoijade Jun 7, 2021
45dc556
Merge branch 'branch-21.08' into gridStridedDist
mdoijade Jun 8, 2021
4f5b2cf
Add lds and sts inline ptx instructions to force vector instruction g…
mdoijade Jun 9, 2021
f71759f
Merge branch 'branch-21.08' into vectorized_LDS_STS
mdoijade Jun 11, 2021
923e9a1
document the shmem ptr alignment requirement for vector load/stores
mdoijade Jun 11, 2021
cpp/include/raft/common/device_loads_stores.cuh (68 additions, 33 deletions)
@@ -24,60 +24,95 @@ namespace raft {
 /**
  * @defgroup SmemStores Shared memory store operations
  * @{
  * @brief Stores to shared memory (both vectorized and non-vectorized forms)
- * @param[out] addr shared memory address
+ *        requires the given shmem pointer to be aligned by the vector
+ *        length, like for float4 lds/sts shmem pointer should be aligned
+ *        by 16 bytes else it might silently fail or can also give
+ *        runtime error.
+ * @param[out] addr shared memory address (should be aligned to vector size)
  * @param[in] x data to be stored at this address
  */
-DI void sts(float* addr, const float& x) { *addr = x; }
-DI void sts(float* addr, const float (&x)[1]) { *addr = x[0]; }
+DI void sts(float* addr, const float& x) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x));
+}
+DI void sts(float* addr, const float (&x)[1]) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0]));
+}
 DI void sts(float* addr, const float (&x)[2]) {
-  float2 v2 = make_float2(x[0], x[1]);
-  auto* s2  = reinterpret_cast<float2*>(addr);
-  *s2       = v2;
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
+  asm volatile("st.shared.v2.f32 [%0], {%1, %2};"
+               :
+               : "l"(s2), "f"(x[0]), "f"(x[1]));
 }
 DI void sts(float* addr, const float (&x)[4]) {
-  float4 v4 = make_float4(x[0], x[1], x[2], x[3]);
-  auto* s4  = reinterpret_cast<float4*>(addr);
-  *s4       = v4;
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
+  asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
 }
 
-DI void sts(double* addr, const double& x) { *addr = x; }
-DI void sts(double* addr, const double (&x)[1]) { *addr = x[0]; }
+DI void sts(double* addr, const double& x) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
+  asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x));
+}
+DI void sts(double* addr, const double (&x)[1]) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
+  asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0]));
+}
 DI void sts(double* addr, const double (&x)[2]) {
-  double2 v2 = make_double2(x[0], x[1]);
-  auto* s2   = reinterpret_cast<double2*>(addr);
-  *s2        = v2;
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
+  asm volatile("st.shared.v2.f64 [%0], {%1, %2};"
+               :
+               : "l"(s2), "d"(x[0]), "d"(x[1]));
 }
 /** @} */
 
 /**
  * @defgroup SmemLoads Shared memory load operations
  * @{
  * @brief Loads from shared memory (both vectorized and non-vectorized forms)
+ *        requires the given shmem pointer to be aligned by the vector
+ *        length, like for float4 lds/sts shmem pointer should be aligned
+ *        by 16 bytes else it might silently fail or can also give
+ *        runtime error.
  * @param[out] x the data to be loaded
  * @param[in] addr shared memory address from where to load
+ *            (should be aligned to vector size)
  */
-DI void lds(float& x, float* addr) { x = *addr; }
-DI void lds(float (&x)[1], float* addr) { x[0] = *addr; }
+DI void lds(float& x, float* addr) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
+}
+DI void lds(float (&x)[1], float* addr) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
+}
 DI void lds(float (&x)[2], float* addr) {
-  auto* s2 = reinterpret_cast<float2*>(addr);
-  auto v2  = *s2;
-  x[0]     = v2.x;
-  x[1]     = v2.y;
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];"
+               : "=f"(x[0]), "=f"(x[1])
+               : "l"(s2));
 }
 DI void lds(float (&x)[4], float* addr) {
-  auto* s4 = reinterpret_cast<float4*>(addr);
-  auto v4  = *s4;
-  x[0]     = v4.x;
-  x[1]     = v4.y;
-  x[2]     = v4.z;
-  x[3]     = v4.w;
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
+  asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
+               : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
+               : "l"(s4));
 }
-DI void lds(double& x, double* addr) { x = *addr; }
-DI void lds(double (&x)[1], double* addr) { x[0] = *addr; }
+DI void lds(double& x, double* addr) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
+  asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1));
+}
+DI void lds(double (&x)[1], double* addr) {
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
+  asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1));
+}
 DI void lds(double (&x)[2], double* addr) {
-  auto* s2 = reinterpret_cast<double2*>(addr);
-  auto v2  = *s2;
-  x[0]     = v2.x;
-  x[1]     = v2.y;
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
+  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];"
+               : "=d"(x[0]), "=d"(x[1])
+               : "l"(s2));
 }
 /** @} */
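
For reference, a minimal usage sketch of the vectorized helpers above; the kernel is illustrative (it assumes blockDim.x == 32), and the alignas(16) satisfies the 16-byte alignment the new doc comment requires for the 4-wide float variants:

#include <raft/common/device_loads_stores.cuh>

__global__ void roundTrip(const float* in, float* out) {
  __shared__ alignas(16) float smem[32 * 4];  // 16-byte-aligned base address
  float v[4], w[4];
  int base = threadIdx.x * 4;  // each thread owns one 16-byte-aligned slot
  for (int i = 0; i < 4; ++i) v[i] = in[base + i];
  raft::sts(smem + base, v);   // emits st.shared.v4.f32
  raft::lds(w, smem + base);   // emits ld.shared.v4.f32
  for (int i = 0; i < 4; ++i) out[base + i] = w[i];
}

Whether the vector forms actually reach the binary can be checked by dumping the SASS (for example with cuobjdump --dump-sass) and looking for 128-bit shared-memory LDS/STS instructions.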