-
-
Notifications
You must be signed in to change notification settings - Fork 55
Shared memory + multiple function exits cause invalid results #4
Comments
Not reproducible with this code anymore, but |
New repro, again using shared memory + bounds checking, but this time the invalid value is the result of using CUDAdrv, CUDAnative
# Minimal CUDAnative kernel reproducing the shared-memory + bounds-check miscompile.
# Lane 0 zero-initializes a shared-memory slot, all threads synchronize, and lane 0
# writes the shfl_down result back through `ptr`.
function kernel(ptr::Ptr{Cint})
# statically-allocated shared memory: 4 x Cint
shared = @cuStaticSharedMem(Cint, 4)
# lane index within the warp; NOTE(review): `warpsize` is used as a plain value
# here — confirm this matches the CUDAnative API version (later APIs use warpsize()).
lane = (threadIdx().x-1) % warpsize
if lane == 0
# This is the bounds check whose presence flips the observed result
# (the thread reports 0 with checkbounds, 32 without).
@boundscheck Base.checkbounds(shared, threadIdx().x)
unsafe_store!(shared.ptr, 0, threadIdx().x)
end
sync_threads()
val = shfl_down(Cint(32), 1, 4)
if lane == 0
unsafe_store!(ptr, val)
end
return
end
dev = CuDevice(0)
ctx = CuContext(dev)
gpu_val = CuArray(Cint, 1)
@cuda dev (1,4) kernel(gpu_val.ptr)
val = Array(gpu_val)[1]
println(val)
destroy(ctx) Returns 0 with checkbounds, 32 without. |
Managed to reduce to two sets of LLVM IR, executed using the following snippet: using CUDAdrv, CUDAnative, LLVM
dev = CuDevice(0)
ctx = CuContext(dev)
for ir_fn in ["bug-working.ll", "bug-broken.ll"]
gpu_val = CuArray(Cint[42])
ir = readstring(ir_fn)
mod = parse(LLVM.Module, ir)
fn = "kernel"
entry = get(functions(mod), "kernel")
ptx = CUDAnative.mcgen(mod, entry, v"3.0")
cuda_mod = CuModule(ptx)
cuda_fun = CuFunction(cuda_mod, fn)
cudacall(cuda_fun, 1, 4, (Ptr{Cint},), gpu_val.ptr)
val = Array(gpu_val)[1]
println(val)
end
destroy(ctx) Working IR: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4
define void @kernel(i32*) {
top:
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = and i32 %1, 31
%3 = icmp eq i32 %2, 0
br i1 %3, label %lane0_boundscheck, label %sync_shfl
lane0_boundscheck:
%4 = icmp ugt i32 %1, 3
br i1 %4, label %lane0_oob, label %lane0_shmem
lane0_oob:
tail call void @llvm.trap()
unreachable
sync_shfl:
tail call void @llvm.nvvm.barrier0()
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199)
br i1 %3, label %lane0_writeback, label %end
lane0_shmem:
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1
store i32 0, i32 addrspace(3)* %6, align 8
br label %sync_shfl
lane0_writeback:
store i32 %5, i32* %0, align 8
br label %end
end:
ret void
}
declare void @llvm.trap()
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare void @llvm.nvvm.barrier0() Broken IR: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4
define void @kernel(i32*) {
top:
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = and i32 %1, 31
%3 = icmp eq i32 %2, 0
br i1 %3, label %lane0_boundscheck, label %sync_shfl
lane0_boundscheck:
%4 = icmp ugt i32 %1, 3
br i1 %4, label %lane0_oob, label %lane0_shmem
sync_shfl:
tail call void @llvm.nvvm.barrier0()
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199)
br i1 %3, label %lane0_writeback, label %end
lane0_oob:
tail call void @llvm.trap()
unreachable
lane0_shmem:
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1
store i32 0, i32 addrspace(3)* %6, align 8
br label %sync_shfl
lane0_writeback:
store i32 %5, i32* %0, align 8
br label %end
end:
ret void
}
declare void @llvm.trap()
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare void @llvm.nvvm.barrier0() That's right, the only difference between those two is the placement of the |
One layer deeper... Working PTX:
Broken PTX:
Loader:
Only difference: the bounds-check branch (>3 or <4): $ diff bug-working.ptx bug-broken.ptx *[master]
22,24c22,24
< setp.lt.u32 %p2, %r1, 4;
< @%p2 bra BB_SHMEM;
< bra.uni BB_OOB;
---
> setp.gt.u32 %p2, %r1, 3;
> @%p2 bra BB_OOB;
> bra.uni BB_SHMEM; Probably an assembler bug. |
Alternative loader, using using CUDAdrv
dev = CuDevice(0)
ctx = CuContext(dev)
fn = "kernel"
for name in ["kernel-working", "kernel-broken"]
gpu_val = CuArray(Cint[42])
run(`ptxas -arch=sm_61 -o $name.cubin $name.ptx`)
cuda_mod = CuModule(read("$name.cubin"))
cuda_fun = CuFunction(cuda_mod, fn)
cudacall(cuda_fun, 1, 4, (Ptr{Cint},), gpu_val.ptr)
val = Array(gpu_val)[1]
println(val)
end
destroy(ctx) |
Almost definitely looks like an assembler bug. See the following annotated & prettified Pascal SASS ( Working version:
Broken version:
The broken version clearly messes up its reconvergence stack, not pushing anything on it despite multiple conditional branches (for some info on how this works, see this paper by Bialas and Strzelecki)... |
And a C++ loader, for reporting purposes. #include <stdio.h>
#include <cuda.h>
#define CHECK(err) __check(err, __FILE__, __LINE__)
// Abort with a diagnostic if a CUDA driver-API call failed.
// Invoked through the CHECK(err) macro, which supplies __FILE__/__LINE__.
inline void __check(CUresult err, const char *file, const int line) {
    if (CUDA_SUCCESS != err) {
        const char *name, *descr;
        cuGetErrorName(err, &name);
        // Bug fix: the description must be written into `descr`; the original
        // passed `&name` a second time, so `descr` was printed uninitialized.
        cuGetErrorString(err, &descr);
        fprintf(stderr, "CUDA error #%s: %s at %s:%i\n", name, descr, file, line);
        abort();
    }
}
int test(const char *path) {
CUmodule mod;
cuModuleLoad(&mod, path);
CUfunction fun;
CHECK(cuModuleGetFunction(&fun, mod, "kernel"));
int *gpu_val;
CHECK(cuMemAlloc((CUdeviceptr*) &gpu_val, sizeof(int)));
void *args[1] = {&gpu_val};
cuLaunchKernel(fun, 1, 1, 1, 4, 1, 1, 0, NULL, args, NULL);
int val;
CHECK(cuMemcpyDtoH(&val, (CUdeviceptr) gpu_val, sizeof(int)));
CHECK(cuModuleUnload(mod));
return val;
}
// Initialize the driver, create a context on device 0, and run both PTX
// variants so their results can be compared side by side.
int main() {
CHECK(cuInit(0));
CUdevice dev;
CHECK(cuDeviceGet(&dev, 0));
CUcontext ctx;
CHECK(cuCtxCreate(&ctx, 0, dev));
printf("working: %d\n", test("kernel-working.ptx"));
printf("broken: %d\n", test("kernel-broken.ptx"));
CHECK(cuCtxDestroy(ctx));
return 0;
} Will probably submit this to NVIDIA soon, unless anybody still spots us doing something wrong. |
Reported this repro to NVIDIA, bug #1833004. Will disable bounds checking for the time being. |
Could we fix this on the LLVM side? Any bugfix to the assembler is going to be deployed slowly. |
I haven't figured out what PTX pattern exactly triggers the SASS emission bug. Probably the branch to a trap BB. I've asked NVIDIA for some background on the bug, if they deem it a bug, so I'm going to wait for them to respond before sinking more time into this. |
Status update from NVIDIA:
... but I haven't got access to their bug tracker (I'm only on its CC list), so I can't look at or ask for more details 😕 |
At least you know it is in fact their fault :) |
No idea how / starting which version / ... though (still don't allow me access to the bug tracker). |
Revisited this issue. Seems like it's still there, at least on NVIDIA driver 375.39, but I found out that it only reproduces on Anyone with |
I only have access to |
Great! Please send me the output (verify the bug is still there), SASS files generated by |
It might take me a week or so.
…On Thu, 27 Apr 2017, 20:33 Tim Besard, ***@***.***> wrote:
Great! Please send me the output (verify the bug is still there), SASS
files generated by ptx.jl (remove existing ones first), and the driver
version. No hurry though, it's not like we can do much about it. But given
some extra data points, it might be possible to re-enable bounds checking...
—
You are receiving this because you commented.
Reply to this email directly, view it on GitHub
<#4 (comment)>,
or mute the thread
<https://github.com/notifications/unsubscribe-auth/AAI3akQIlKponJ9Fkru2M0ORdxPFs5yqks5r0IstgaJpZM4J_KJM>
.
|
Bug still there on 375.66 (current long-lived). |
It looks like you've discovered https://bugs.llvm.org/show_bug.cgi?id=27738, or something related. Unfortunately we've gotten zero movement from nvidia on this in the ~1.5 years since we discovered it ourselves and brought it to their attention. It's possible that CUDA 9's ptxas will be better, but I don't expect a proper fix except inasmuch as "buy a Volta card and use the new Yours is the cleanest reduction of this bug I've seen, btw. |
FYI, @timshen91 is rolling out an incomplete fix for this in LLVM, and working on the full fix. He'll post details in the bug. Empirically, the partial fix he has in hand fixes this problem for everything we've seen on our end. We'd be curious to hear if it fixes anything for you all. |
Oh cool, thanks for the ping! I'll have a look about reproducing, since it's a while ago since I last looked at this. We also mentioned this issue to NVIDIA and they were going to look into giving us more info; if that happens I'll update here. |
The partial fix is https://reviews.llvm.org/D45008 and https://reviews.llvm.org/D45070. Once they are committed, I'll update with the revision number that needs to be sync'ed pass. |
Any LLVM whose revision is larger than or equal to r328885 should include my partial fix. I tried to use 367.48 nvcc and ptxas (but with a newer driver) to reproduce the bug but failed. I'll wait for @maleadt for a short period of time and see what will happen. :) |
Similarly, I had to revert to 375.66, as I could not reproduce the issue on 384.111 (Debian stable BPO). Testing on r329021, it seems like the bug is still there though (on
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4
define void @kernel(i32*) {
top:
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = and i32 %1, 31
%3 = icmp eq i32 %2, 0
br i1 %3, label %lane0_boundscheck, label %sync_shfl
lane0_boundscheck:
%4 = icmp ugt i32 %1, 3
br i1 %4, label %lane0_oob, label %lane0_shmem
lane0_oob:
tail call void @llvm.trap()
unreachable
sync_shfl:
tail call void @llvm.nvvm.barrier0()
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199)
br i1 %3, label %lane0_writeback, label %end
lane0_shmem:
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1
store i32 0, i32 addrspace(3)* %6, align 8
br label %sync_shfl
lane0_writeback:
store i32 %5, i32* %0, align 8
br label %end
end:
ret void
}
declare void @llvm.trap()
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare void @llvm.nvvm.barrier0()
!nvvm.annotations = !{!0}
!0 = !{void (i32*)* @kernel, !"kernel", i32 1}
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@shmem = internal addrspace(3) global [4 x i32] zeroinitializer, align 4
define void @kernel(i32*) {
top:
%1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = and i32 %1, 31
%3 = icmp eq i32 %2, 0
br i1 %3, label %lane0_boundscheck, label %sync_shfl
lane0_boundscheck:
%4 = icmp ugt i32 %1, 3
br i1 %4, label %lane0_oob, label %lane0_shmem
sync_shfl:
tail call void @llvm.nvvm.barrier0()
%5 = tail call i32 @llvm.nvvm.shfl.down.i32(i32 32, i32 1, i32 7199)
br i1 %3, label %lane0_writeback, label %end
lane0_oob:
tail call void @llvm.trap()
unreachable
lane0_shmem:
%6 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @shmem, i32 0, i32 %1
store i32 0, i32 addrspace(3)* %6, align 8
br label %sync_shfl
lane0_writeback:
store i32 %5, i32* %0, align 8
br label %end
end:
ret void
}
declare void @llvm.trap()
declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare void @llvm.nvvm.barrier0()
!nvvm.annotations = !{!0}
!0 = !{void (i32*)* @kernel, !"kernel", i32 1} Compile to PTX:
Loader for PTX code: #include <stdio.h>
#include <cuda.h>
#define CHECK(err) __check(err, __FILE__, __LINE__)
// Abort with a diagnostic if a CUDA driver-API call failed.
// Invoked through the CHECK(err) macro, which supplies __FILE__/__LINE__.
inline void __check(CUresult err, const char *file, const int line) {
    if (CUDA_SUCCESS != err) {
        const char *name, *descr;
        cuGetErrorName(err, &name);
        // Bug fix: the description must be written into `descr`; the original
        // passed `&name` a second time, so `descr` was printed uninitialized.
        cuGetErrorString(err, &descr);
        fprintf(stderr, "CUDA error #%s: %s at %s:%i\n", name, descr, file, line);
        abort();
    }
}
// Load the PTX module at `path`, launch its "kernel" entry point with one
// block of 4 threads, and return the int the kernel wrote to device memory.
int test(const char *path) {
CUmodule mod;
CHECK(cuModuleLoad(&mod, path));
CUfunction fun;
CHECK(cuModuleGetFunction(&fun, mod, "kernel"));
int *gpu_val;
// NOTE(review): this allocation is never cuMemFree'd — harmless for a
// one-shot repro, but a leak if test() were called repeatedly.
CHECK(cuMemAlloc((CUdeviceptr*) &gpu_val, sizeof(int)));
void *args[1] = {&gpu_val};
// grid (1,1,1), block (4,1,1), no dynamic shared memory, default stream
CHECK(cuLaunchKernel(fun, 1, 1, 1, 4, 1, 1, 0, NULL, args, NULL));
int val;
CHECK(cuMemcpyDtoH(&val, (CUdeviceptr) gpu_val, sizeof(int)));
CHECK(cuModuleUnload(mod));
return val;
}
// Initialize the driver, create a context on device 0, and run both PTX
// variants so their results can be compared side by side.
int main() {
CHECK(cuInit(0));
CUdevice dev;
CHECK(cuDeviceGet(&dev, 0));
CUcontext ctx;
CHECK(cuCtxCreate(&ctx, 0, dev));
printf("working: %d\n", test("working.ptx"));
printf("broken: %d\n", test("broken.ptx"));
CHECK(cuCtxDestroy(ctx));
return 0;
} Output:
Even though the generated PTX does differ between LLVM 6.0 and LLVM ToT (but differs identically wrt. the working or broken versions): --- working_6.0.ptx 2018-04-03 10:34:01.000000000 +0200
+++ working_ToT.ptx 2018-04-03 09:57:20.000000000 +0200
@@ -39,12 +39,12 @@
mov.u32 %r5, 32;
shfl.down.b32 %r3, %r5, 1, 7199;
@%p3 bra LBB0_5;
-// %bb.6: // %end
- ret;
+ bra.uni LBB0_6;
LBB0_5: // %lane0_writeback
ld.param.u64 %rd2, [kernel_param_0];
cvta.to.global.u64 %rd1, %rd2;
st.global.u32 [%rd1], %r3;
+LBB0_6: // %end
ret;
LBB0_2: // %lane0_oob
trap; |
I suspect that this is because the driver contains a copy of ptxas, so changing the driver version changes the ptxas version you're using. If you compiled all the way to SASS for your GPU (dunno if your frontend does this) ahead of time using ptxas, then the driver version shouldn't matter. I can link you to how we do this in XLA if it'd be helpful. Will leave the analysis here to @timshen91. |
I also reproduced the ptxas miscompile on sm_61 with ptxas 8.0. I modified the launcher to call kernel<<<...>>>(...), and link the pre-compiled ptx into the launcher. It looks like the I attempted four different variations: (a) and (b) attempted to fix the control flow graph (CFG) region structure, but they didn't work. Both (c) and (d) work, but I can't extract a principled heuristic from (c) or (d). Hopefully the new ptxas fixes this kind of issue(s) once and for all. |
Yeah, I've been deliberately using the driver for this because I assume it to be faster than having to call
Right, I assume this breaks the structured CFG requirement. I'll just avoid emitting By the way, any suggestions on similar fatal error reporting mechanisms? |
XLA doesn't require this functionality at the moment, but we have talked about adding an assert/trap instruction to XLA. Our idea for implementing it was to use a global variable. Which is ugly for sure. But I'm not sure how to do the global variable and prevent future kernels from running. That's really what |
Pretty sure I just ran into another occurrence of this bug: using CUDAnative, CUDAdrv
# CPU reference implementation: fill a 2-element vector with input[1].
# (Indexing a scalar number with [1] in Julia returns the scalar itself.)
function cpu(input)
# NOTE(review): Vector{Cint}(2) is pre-1.0 constructor syntax; modern Julia
# spells this Vector{Cint}(undef, 2).
output = Vector{Cint}(2)
for i in 1:2
output[i] = input[1]
end
return output
end
# GPU kernel: thread 1 stashes `input` in shared memory; after the barrier
# every thread copies that shared value into output[i].
function kernel(input, output, n)
i = threadIdx().x
# one Cint of statically-allocated shared memory
temp = @cuStaticSharedMem(Cint, 1)
if i == 1
# manual bounds check: trap if n < 1 (mirrors the @boundscheck pattern
# from the original repro)
1 <= n || ccall("llvm.trap", llvmcall, Cvoid, ())
temp[1] = input
end
sync_threads()
# manual bounds check: trap if this thread's index exceeds n
i <= n || ccall("llvm.trap", llvmcall, Cvoid, ())
unsafe_store!(output, temp[1], i)
end
# Launch the kernel with 2 threads and read back the 2-element result.
function gpu(input)
output_gpu = Mem.alloc(Cint, 2)
# n=42 is larger than the thread count (i ∈ {1,2}), so the traps never fire
@cuda threads=2 kernel(input, convert(Ptr{eltype(input)}, output_gpu.ptr), 42)
return Mem.download(Cint, output_gpu, 2)
end
using Test
# Compare the CPU reference against the GPU result for one random input;
# a mismatch here is the symptom of the miscompile.
function main()
input = rand(Cint(1):Cint(100))
@test cpu(input) == gpu(input)
end
Was going to reduce this further (SASS, C++ loader) but these PTX files now seem to hang in both |
Ugh, these ptxas bugs are the worst. :( The ptx LLVM is generating here does not look particularly well-structurized to me, though. It's conceivable that better structurization in LLVM would resolve this. I think @timshen91 had been hoping that the current amount of structurization we apply (really, iirc, it's that we turned off passes that would make the graph less structured) would be sufficient, but maybe you're proving that's not the case. |
Hi @maleadt, Do you have the LLVM IR, and possibly the set of LLVM flags used to generate the ptx? |
Sure. I'll dump as much relevant info as possible. The original high-level source code is as follows: function kernel(input::Int32, output::Ptr{Int32}, yes::Bool=true)
i = threadIdx().x
temp = @cuStaticSharedMem(Cint, 1)
if i == 1
yes || no()
temp[1] = input
end
sync_threads()
yes || no()
unsafe_store!(output, temp[1], i)
end
function no()
ccall("llvm.trap", llvmcall, Cvoid, ())
end That is compiled to the following LLVM IR: ; ModuleID = 'KernelWrapper'
source_filename = "KernelWrapper"
target triple = "nvptx64-nvidia-cuda"
%jl_value_t = type opaque
@shmem1 = addrspace(3) global [1 x i32] zeroinitializer, align 16
define i64 @julia_kernel_36616(i32, i64, i8) local_unnamed_addr {
top:
%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !2
%4 = icmp eq i32 %3, 0
%5 = and i8 %2, 1
br i1 %4, label %L17, label %L27
L17: ; preds = %top
%6 = icmp eq i8 %5, 0
br i1 %6, label %L19, label %L22
L19: ; preds = %L17
call void @llvm.trap()
unreachable
L22: ; preds = %L17
store i32 %0, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @shmem1, i64 0, i64 0), align 16, !tbaa !3
br label %L27
L27: ; preds = %top, %L22
call void @llvm.nvvm.barrier0()
%7 = icmp eq i8 %5, 0
br i1 %7, label %L30, label %L33
L30: ; preds = %L27
call void @llvm.trap()
unreachable
L33: ; preds = %L27
%8 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @shmem1, i64 0, i64 0), align 16, !tbaa !3
%9 = zext i32 %3 to i64
%10 = inttoptr i64 %1 to i32*
%11 = getelementptr inbounds i32, i32* %10, i64 %9
store i32 %8, i32* %11, align 1, !tbaa !6
ret i64 %1
}
; Function Attrs: noreturn nounwind
declare void @llvm.trap() #0
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
define void @ptxcall_kernel_1(i32, i64, i8) local_unnamed_addr {
entry:
%3 = call i64 @julia_kernel_36616(i32 %0, i64 %1, i8 %2)
ret void
}
attributes #0 = { noreturn nounwind }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
attributes #3 = { allocsize(1) }
!llvm.module.flags = !{!0}
!nvvm.annotations = !{!1}
!0 = !{i32 1, !"Debug Info Version", i32 3}
!1 = !{void (i32, i64, i8)* @ptxcall_kernel_1, !"kernel", i32 1}
!2 = !{i32 0, i32 1023}
!3 = !{!4, !4, i64 0, i64 0}
!4 = !{!"ptxtbaa_shared", !5, i64 0}
!5 = !{!"ptxtbaa"}
!6 = !{!7, !7, i64 0}
!7 = !{!"jtbaa_data", !8, i64 0}
!8 = !{!"jtbaa"} Which in turn generates the following PTX:
This code generates broken SASS:
Interestingly, changing the kernel wrapper to pass a literal define void @ptxcall_kernel_1(i32, i64) local_unnamed_addr {
entry:
%2 = call i64 @julia_kernel_36630(i32 %0, i64 %1, i8 1), !dbg !65
ret void
}
You actually don't need the @0 = internal unnamed_addr constant [27 x i8] c"go home ptxas you're drunk\00"
define i64 @julia_kernel_36648(i32, i64, i8) local_unnamed_addr {
top:
%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !2
%4 = icmp eq i32 %3, 0
%5 = and i8 %2, 1
br i1 %4, label %L17, label %L30
L17: ; preds = %top
%6 = icmp eq i8 %5, 0
br i1 %6, label %L19, label %L25
L19: ; preds = %L17
%7 = call i32 @vprintf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @0, i64 0, i64 0), i8* null)
unreachable
L25: ; preds = %L17
store i32 %0, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @shmem1, i64 0, i64 0), align 16, !tbaa !3
br label %L30
L30: ; preds = %top, %L25
call void @llvm.nvvm.barrier0()
%8 = icmp eq i8 %5, 0
br i1 %8, label %L33, label %L39
L33: ; preds = %L30
%9 = call i32 @vprintf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @0, i64 0, i64 0), i8* null)
unreachable
L39: ; preds = %L30
%10 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @shmem1, i64 0, i64 0), align 16, !tbaa !3
%11 = zext i32 %3 to i64
%12 = inttoptr i64 %1 to i32*
%13 = getelementptr inbounds i32, i32* %12, i64 %11
store i32 %10, i32* %13, align 1, !tbaa !6
ret i64 %1
} All this is done with LLVM 6.0, with quite some patches but none specific to NVPTX. What LLVM flags are relevant here? The PTX MC target is initialized with only an ISA flag set, targeting |
Observations after having lost some more time on this:
Currently trying out some fairly horrible transformations that replace It seems to work OK and passes our fairly comprehensive tests, some of which consistently fail without these transformations (toolkit v10, driver v410.57). Closing this for now, I don't think we can do much better (apart from improving the transformations / moving to LLVM / convincing NVIDIA to fix their stuff). |
Cause seems to be an added
checkbounds
, if that even makes sense.Repro:
Result without
checkbounds
:[1; 1]
. With:[1; 0]
.cc @cfoket
The text was updated successfully, but these errors were encountered: