Simplify rewriting of unreachable blocks. #277
Conversation
@vchuravy could you try this PR to see how it affects register pressure on your application? I've switched approaches: instead of removing …
This is what it looks like:

julia> kernel(a) = (a[1]=1; nothing)
kernel (generic function with 1 method)

julia> CUDAnative.code_llvm(kernel, Tuple{CuDeviceArray{Int,2,AS.Global}})
define void @julia_kernel_1({ [2 x i64], { i64 } } addrspace(11)* nocapture nonnull readonly dereferenceable(24)) local_unnamed_addr {
top:
  %1 = getelementptr { [2 x i64], { i64 } }, { [2 x i64], { i64 } } addrspace(11)* %0, i64 0, i32 0, i64 0
  %2 = getelementptr { [2 x i64], { i64 } }, { [2 x i64], { i64 } } addrspace(11)* %0, i64 0, i32 0, i64 1
  %3 = load i64, i64 addrspace(11)* %1, align 8, !tbaa !1, !invariant.load !4
  %4 = load i64, i64 addrspace(11)* %2, align 8, !tbaa !1, !invariant.load !4
  %5 = mul i64 %4, %3
  %6 = icmp slt i64 %5, 1
  br i1 %6, label %L14, label %L17

L14:                                    ; preds = %top
  call fastcc void @ptx_throw_boundserror()
  call fastcc void @opaque_exit()
  br label %opaque_unreachable

L17:                                    ; preds = %top
  %7 = getelementptr inbounds { [2 x i64], { i64 } }, { [2 x i64], { i64 } } addrspace(11)* %0, i64 0, i32 1, i32 0
  %8 = bitcast i64 addrspace(11)* %7 to i64* addrspace(11)*
  %9 = load i64*, i64* addrspace(11)* %8, align 8, !tbaa !1, !invariant.load !4
  %10 = addrspacecast i64* %9 to i64 addrspace(1)*
  store i64 1, i64 addrspace(1)* %10, align 8, !tbaa !5
  ret void

opaque_unreachable:                     ; preds = %L14, %opaque_unreachable
  br label %opaque_unreachable
}

define internal fastcc void @opaque_exit() unnamed_addr {
entry:
  %0 = icmp eq i32 1, 1
  br i1 %0, label %trap.preheader, label %loop.preheader

loop.preheader:                         ; preds = %entry
  br label %loop

trap.preheader:                         ; preds = %entry
  br label %trap

trap:                                   ; preds = %trap.preheader, %trap
  call void asm sideeffect "exit;", ""() #0
  br label %trap

loop:                                   ; preds = %loop.preheader, %loop
  br label %loop
}
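For reference, here's a minimal sketch of what this kind of rewrite could look like as an LLVM.jl pass. The name `hide_unreachable!` and the exact structure are my assumptions for illustration, not necessarily what this PR implements:

```julia
using LLVM

# Sketch: replace every `unreachable` terminator with a branch to a single
# self-looping "opaque" block, so later passes no longer see blocks that
# provably end control flow (hypothetical pass; see caveat above).
function hide_unreachable!(fun::LLVM.Function)
    # collect the blocks that end in `unreachable`
    worklist = [bb for bb in blocks(fun) if terminator(bb) isa LLVM.UnreachableInst]
    isempty(worklist) && return false

    # append one opaque block that just branches to itself
    opaque = BasicBlock(fun, "opaque_unreachable")
    @dispose builder=IRBuilder() begin
        position!(builder, opaque)
        br!(builder, opaque)

        # rewrite each `unreachable` into a branch to the opaque loop
        for bb in worklist
            unsafe_delete!(bb, terminator(bb))
            position!(builder, bb)
            br!(builder, opaque)
        end
    end
    return true
end
```

This matches the IR above: `%L14` no longer ends in an `unreachable` after the bounds-error throw, but branches to a `%opaque_unreachable` block that loops on itself.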
This is looking good. It reduced the register pressure even more (another 6 registers, if I recall correctly) in the kernels I was looking at. The only snag I hit was …
Now this code doesn't work on this branch:

using Test
using Random
using CuArrays
using GPUArrays
using CUDAnative
function main()
    @eval CUDAnative globalUnique=0
    empty!(CUDAnative.compilecache)

    Random.seed!(0)
    A = rand(1:10, 100)
    @show cpu = mapreduce(identity, +, A)

    dA = CuArray(A)
    @show gpu = mapreduce(identity, +, dA)

    @test cpu ≈ gpu
end

function Base.mapreduce(f::Function, op::Function, A::CuArray{T, N}) where {T, N}
    OT = Int
    v0 = 0

    out = CuArray{OT,1}(undef, 1)
    @cuda threads=64 reduce_kernel(f, op, v0, A, out)
    Array(out)[1]
end

function reduce_kernel(f, op, v0::T, A, result) where {T}
    tmp_local = @cuStaticSharedMem(T, 64)
    acc = v0

    # Loop sequentially over chunks of input vector
    i = threadIdx().x
    while i <= length(A)
        element = f(A[i])
        acc = op(acc, element)
        i += blockDim().x
    end

    # Perform parallel reduction
    @inbounds tmp_local[threadIdx().x] = acc
    sync_threads()
    offset = blockDim().x ÷ 2
    while offset > 0
        @inbounds if threadIdx().x <= offset
            other = tmp_local[(threadIdx().x - 1) + offset + 1]
            mine = tmp_local[threadIdx().x]
            tmp_local[threadIdx().x] = op(mine, other)
        end
        sync_threads()
        offset = offset ÷ 2
    end

    if threadIdx().x == 1
        result[blockIdx().x] = @inbounds tmp_local[1]
    end

    return
end

Eerily similar to #4. It even fails without the call to …
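For completeness: the reproducer above is presumably driven by just calling the entry point (my assumption; the snippet doesn't show the call):

```julia
main()   # per the report, the `@test cpu ≈ gpu` fails on this branch
```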
Sometimes getting …
Turns out we can leave the … @vchuravy, could you have another look at how this impacts register usage? EDIT: aw crap, this still causes test failures. Argh.
OK, works again when using …
Compared to current …