Skip to content

Commit

Permalink
Optimize run_spin_excitation! for GPU (#462)
Browse files Browse the repository at this point in the history
  • Loading branch information
rkierulf authored Aug 19, 2024
1 parent 1457a4c commit 1b6c5be
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 4 deletions.
46 changes: 45 additions & 1 deletion KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
include("KernelFunctions.jl")

"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster
to calculate them on the CPU before the simulation is run."""
struct SeqBlockProperties{T<:Real}
Expand Down Expand Up @@ -64,6 +66,10 @@ end
struct BlochGPUPrealloc{T} <: PreallocResult{T}
seq_properties::AbstractVector{SeqBlockProperties{T}}
Bz::AbstractMatrix{T}
B::AbstractMatrix{T}
φ::AbstractMatrix{T}
ΔT1::AbstractMatrix{T}
ΔT2::AbstractMatrix{T}
Δϕ::AbstractMatrix{T}
ϕ::AbstractArray{T}
Mxy::AbstractMatrix{Complex{T}}
Expand All @@ -77,6 +83,10 @@ function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
return BlochGPUPrealloc(
[seq_block],
view(p.Bz,:,1:seq_block.length),
view(p.B,:,1:seq_block.length),
view(p.φ,:,1:seq_block.length-1),
view(p.ΔT1,:,1:seq_block.length-1),
view(p.ΔT2,:,1:seq_block.length-1),
view(p.Δϕ,:,1:seq_block.length-1),
seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
view(p.Mxy,:,1:seq_block.nADC),
Expand All @@ -91,6 +101,10 @@ function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}
return BlochGPUPrealloc(
precalc.seq_properties,
KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
Expand Down Expand Up @@ -150,4 +164,34 @@ function run_spin_precession!(
M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* _cis.(pre.ϕ[:,end])

return nothing
end
end

function run_spin_excitation!(
p::Phantom{T},
seq::DiscreteSequence{T},
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::Bloch,
backend::KA.GPU,
pre::BlochGPUPrealloc
) where {T<:Real}
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')

#Effective Field
pre.Bz .= (x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz') .+ pre.ΔBz .- seq.Δf' ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
pre.B .= sqrt.(abs.(seq.B1') .^ 2 .+ abs.(pre.Bz) .^ 2)

#Spinor Rotation
pre.φ .= T(-π .* γ) .* (pre.B[:,1:end-1] .* seq.Δt') # TODO: Use trapezoidal integration here (?), this is just Forward Euler

#Relaxation
pre.ΔT1 .= exp.(-seq.Δt' ./ p.T1)
pre.ΔT2 .= exp.(-seq.Δt' ./ p.T2)

#Excitation
apply_excitation!(backend, 512)(M.xy, M.z, pre.φ, seq.B1, pre.Bz, pre.B, pre.ΔT1, pre.ΔT2, p.ρ, ndrange=size(M.xy,1))
KA.synchronize(backend)

return nothing
end
61 changes: 61 additions & 0 deletions KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
using KernelAbstractions: @kernel, @Const, @index, @uniform, @groupsize, @localmem

## COV_EXCL_START

@kernel function apply_excitation!(Mxy, Mz, @Const(φ), @Const(B1), @Const(Bz), @Const(B), @Const(ΔT1), @Const(ΔT2), @Const(ρ))
i_g = @index(Global)
i_l = @index(Local)

@uniform T = eltype(φ)
@uniform N = @groupsize()[1]
@uniform N_Δt = size(φ, 2)

s_α_r = @localmem T (N,)
s_α_i = @localmem T (N,)
s_β_i = @localmem T (N,)
s_β_r = @localmem T (N,)
s_Mxy_r = @localmem T (N,)
s_Mxy_i = @localmem T (N,)
s_Mxy_new_r = @localmem T (N,)
s_Mxy_new_i = @localmem T (N,)
s_Mz = @localmem T (N,)
s_Mz_new = @localmem T (N,)
s_ρ = @localmem T (N,)

@inbounds s_Mxy_r[i_l] = real(Mxy[i_g])
@inbounds s_Mxy_i[i_l] = imag(Mxy[i_g])
@inbounds s_Mz[i_l] = Mz[i_g]
@inbounds s_ρ[i_l] = ρ[i_g]

@inbounds for t = 1 : N_Δt
sin_φ = sin(φ[i_g, t]) #TO-DO: use sincos once oneAPI releases version with https://github.com/JuliaGPU/oneAPI.jl/commit/260a4dda0ea223dbf0893de7b4a13d994ae27bd1
cos_φ = cos(φ[i_g, t])
s_α_r[i_l] = cos_φ
if (iszero(B[i_g, t]))
s_α_i[i_l] = -(Bz[i_g, t] / (B[i_g, t] + eps(T))) * sin_φ
s_β_r[i_l] = (imag(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
s_β_i[i_l] = -(real(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
else
s_α_i[i_l] = -(Bz[i_g, t] / B[i_g, t]) * sin_φ
s_β_r[i_l] = (imag(B1[t]) / B[i_g, t]) * sin_φ
s_β_i[i_l] = -(real(B1[t]) / B[i_g, t]) * sin_φ
end
s_Mxy_new_r[i_l] = 2 * (s_Mxy_i[i_l] * (s_α_r[i_l] * s_α_i[i_l] - s_β_r[i_l] * s_β_i[i_l]) +
s_Mz[i_l] * (s_α_i[i_l] * s_β_i[i_l] + s_α_r[i_l] * s_β_r[i_l])) +
s_Mxy_r[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 - s_β_r[i_l]^2 + s_β_i[i_l]^2)
s_Mxy_new_i[i_l] = -2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_α_i[i_l] + s_β_r[i_l] * s_β_i[i_l]) -
s_Mz[i_l] * (s_α_r[i_l] * s_β_i[i_l] - s_α_i[i_l] * s_β_r[i_l])) +
s_Mxy_i[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 + s_β_r[i_l]^2 - s_β_i[i_l]^2)
s_Mz_new[i_l] = s_Mz[i_l] * (s_α_r[i_l]^2 + s_α_i[i_l]^2 - s_β_r[i_l]^2 - s_β_i[i_l]^2) -
2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_β_r[i_l] - s_α_i[i_l] * s_β_i[i_l]) +
s_Mxy_i[i_l] * (s_α_r[i_l] * s_β_i[i_l] + s_α_i[i_l] * s_β_r[i_l]))
s_Mxy_r[i_l] = s_Mxy_new_r[i_l] * ΔT2[i_g, t]
s_Mxy_i[i_l] = s_Mxy_new_i[i_l] * ΔT2[i_g, t]
s_Mz[i_l] = s_Mz_new[i_l] * ΔT1[i_g, t] + s_ρ[i_l] * (1 - ΔT1[i_g, t])
end

@inbounds Mxy[i_g] = s_Mxy_r[i_l] + 1im * s_Mxy_i[i_l]
@inbounds Mz[i_g] = s_Mz[i_l]
end

## COV_EXCL_STOP
4 changes: 1 addition & 3 deletions KomaMRICore/src/simulation/SimulatorCore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -332,9 +332,7 @@ function simulate(
sim_params = default_sim_params(sim_params)
#Warn if user is trying to run on CPU without enabling multi-threading
if (!sim_params["gpu"] && Threads.nthreads() == 1)
@info """
Simulation will be run on the CPU with only 1 thread. To
enable multi-threading, start julia with --threads=auto.
@info """Simulation will be run on the CPU with only 1 thread. To enable multi-threading, start julia with --threads=auto
""" maxlog=1
end
# Simulation init
Expand Down

1 comment on commit 1b6c5be

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

KomaMRI Benchmarks

Benchmark suite Current: 1b6c5be Previous: 1457a4c Ratio
MRI Lab/Bloch/CPU/2 thread(s) 226897725.5 ns 227517325.5 ns 1.00
MRI Lab/Bloch/CPU/4 thread(s) 134821691 ns 135033124 ns 1.00
MRI Lab/Bloch/CPU/8 thread(s) 143505624 ns 171880824 ns 0.83
MRI Lab/Bloch/CPU/1 thread(s) 343336904 ns 396561930.5 ns 0.87
MRI Lab/Bloch/GPU/CUDA 56897573.5 ns 138134905 ns 0.41
MRI Lab/Bloch/GPU/oneAPI 516675827 ns 14155999496.5 ns 0.03649871753158408
MRI Lab/Bloch/GPU/Metal 561507500 ns 3171338479 ns 0.18
MRI Lab/Bloch/GPU/AMDGPU 36448199 ns 75482754 ns 0.48
Slice Selection 3D/Bloch/CPU/2 thread(s) 1157773672 ns 1168211452 ns 0.99
Slice Selection 3D/Bloch/CPU/4 thread(s) 621102018 ns 612565463 ns 1.01
Slice Selection 3D/Bloch/CPU/8 thread(s) 383824312 ns 495427593 ns 0.77
Slice Selection 3D/Bloch/CPU/1 thread(s) 1947221684 ns 2245843835 ns 0.87
Slice Selection 3D/Bloch/GPU/CUDA 101669032 ns 108701927 ns 0.94
Slice Selection 3D/Bloch/GPU/oneAPI 649875888 ns 776956866 ns 0.84
Slice Selection 3D/Bloch/GPU/Metal 565000312.5 ns 769082459 ns 0.73
Slice Selection 3D/Bloch/GPU/AMDGPU 60318280 ns 64232156 ns 0.94

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.