Skip to content

Commit

Permalink
[NVPTX] Fix 64-bit rotations with large shift values
Browse files Browse the repository at this point in the history
ROTL and ROTR can take a shift amount larger than the element size, in
which case the effective shift amount should be the shift amount modulo
the element size.

This patch adds the modulo step when the shift amount isn't known at
compile time. Without it the existing implementation would end up
shifting beyond the type size and give incorrect results.
  • Loading branch information
npmiller committed Apr 29, 2024
1 parent 6f02120 commit 138f196
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 4 deletions.
10 changes: 6 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
"shl.b64 \t%lhs, $src, $amt;\n\t"
"sub.u32 \t%amt2, 64, $amt;\n\t"
"and.b32 \t%amt2, $amt, 63;\n\t"
"shl.b64 \t%lhs, $src, %amt2;\n\t"
"sub.u32 \t%amt2, 64, %amt2;\n\t"
"shr.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
Expand All @@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
"shr.b64 \t%lhs, $src, $amt;\n\t"
"sub.u32 \t%amt2, 64, $amt;\n\t"
"and.b32 \t%amt2, $amt, 63;\n\t"
"shr.b64 \t%lhs, $src, %amt2;\n\t"
"sub.u32 \t%amt2, 64, %amt2;\n\t"
"shl.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
Expand Down
43 changes: 43 additions & 0 deletions llvm/test/CodeGen/NVPTX/rotate.ll
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,46 @@ define i32 @rotl0(i32 %x) {
%t2 = or i32 %t0, %t1
ret i32 %t2
}

declare i64 @llvm.fshl.i64(i64, i64, i64)
declare i64 @llvm.fshr.i64(i64, i64, i64)

; Rotate-left of a 64-bit value by an amount passed as a function argument,
; written as a funnel shift left with %a as both data operands.
; The directives below expect the amount to be masked with 63 first, followed
; by the shl / sub / shr / add sequence.
; SM35: rotl64
define i64 @rotl64(i64 %a, i64 %n) {
; SM35: and.b32 {{.*}}, 63;
; SM35: shl.b64
; SM35: sub.u32
; SM35: shr.b64
; SM35: add.u64
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
ret i64 %val
}

; Rotate-left of a 64-bit value by the constant 66, which is larger than the
; element size. The directives below expect the amount to be reduced modulo 64:
; a left shift by 2 and a right shift by 62, combined with an add.
; SM35: rotl64_imm
define i64 @rotl64_imm(i64 %a) {
; SM35: shl.b64 {{.*}}, 2;
; SM35: shr.b64 {{.*}}, 62;
; SM35: add.u64
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
}

; Rotate-right of a 64-bit value by an amount passed as a function argument,
; written as a funnel shift right with %a as both data operands.
; The directives below expect the amount to be masked with 63 first, followed
; by the shr / sub / shl / add sequence.
; SM35: rotr64
define i64 @rotr64(i64 %a, i64 %n) {
; SM35: and.b32 {{.*}}, 63;
; SM35: shr.b64
; SM35: sub.u32
; SM35: shl.b64
; SM35: add.u64
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
ret i64 %val
}

; Rotate-right of a 64-bit value by the constant 66, which is larger than the
; element size. The directives below expect the amount to be reduced modulo 64:
; a left shift by 62 and a right shift by 2, combined with an add.
; SM35: rotr64_imm
define i64 @rotr64_imm(i64 %a) {
; SM35: shl.b64 {{.*}}, 62;
; SM35: shr.b64 {{.*}}, 2;
; SM35: add.u64
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
}

0 comments on commit 138f196

Please sign in to comment.