Optimize run_spin_excitation! for GPU (#462)

JuliaHealth · Aug 19, 2024 · 1b6c5be · 1b6c5be · github-actions · Aug 19, 2024
1 parent 1457a4c
commit 1b6c5be
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 4 deletions.
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
@@ -1,3 +1,5 @@
+include("KernelFunctions.jl")
+
 """These properties are redundant with seq.Δt and seq.ADC, but it is much faster
 to calculate them on the CPU before the simulation is run."""
 struct SeqBlockProperties{T<:Real}
@@ -64,6 +66,10 @@ end
 struct BlochGPUPrealloc{T} <: PreallocResult{T}
     seq_properties::AbstractVector{SeqBlockProperties{T}}
     Bz::AbstractMatrix{T}
+    B::AbstractMatrix{T}
+    φ::AbstractMatrix{T}
+    ΔT1::AbstractMatrix{T}
+    ΔT2::AbstractMatrix{T}
     Δϕ::AbstractMatrix{T}
     ϕ::AbstractArray{T}
     Mxy::AbstractMatrix{Complex{T}}
@@ -77,6 +83,10 @@ function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
     return BlochGPUPrealloc(
         [seq_block],
         view(p.Bz,:,1:seq_block.length),
+        view(p.B,:,1:seq_block.length),
+        view(p.φ,:,1:seq_block.length-1),
+        view(p.ΔT1,:,1:seq_block.length-1),
+        view(p.ΔT2,:,1:seq_block.length-1),
         view(p.Δϕ,:,1:seq_block.length-1),
         seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
         view(p.Mxy,:,1:seq_block.nADC),
@@ -91,6 +101,10 @@ function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}
     return BlochGPUPrealloc(
         precalc.seq_properties,
         KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
         KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
         KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
         KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
@@ -150,4 +164,34 @@ function run_spin_precession!(
     M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* _cis.(pre.ϕ[:,end])
 
     return nothing
-end 
+end
+
+function run_spin_excitation!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::Bloch,
+    backend::KA.GPU,
+    pre::BlochGPUPrealloc
+) where {T<:Real}
+    #Motion
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+
+    #Effective Field
+    pre.Bz .= (x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz') .+ pre.ΔBz .- seq.Δf' ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
+    pre.B .= sqrt.(abs.(seq.B1') .^ 2 .+ abs.(pre.Bz) .^ 2)
+
+    #Spinor Rotation
+    pre.φ .= T(-π .* γ) .* (pre.B[:,1:end-1] .* seq.Δt') # TODO: Use trapezoidal integration here (?),  this is just Forward Euler
+
+    #Relaxation
+    pre.ΔT1 .= exp.(-seq.Δt' ./ p.T1)
+    pre.ΔT2 .= exp.(-seq.Δt' ./ p.T2)
+
+    #Excitation
+    apply_excitation!(backend, 512)(M.xy, M.z, pre.φ, seq.B1, pre.Bz, pre.B, pre.ΔT1, pre.ΔT2, p.ρ, ndrange=size(M.xy,1))
+    KA.synchronize(backend)
+
+    return nothing
+end
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl
@@ -0,0 +1,61 @@
+using KernelAbstractions: @kernel, @Const, @index, @uniform, @groupsize, @localmem
+
+## COV_EXCL_START
+
+@kernel function apply_excitation!(Mxy, Mz, @Const(φ), @Const(B1), @Const(Bz), @Const(B), @Const(ΔT1), @Const(ΔT2), @Const(ρ))
+    i_g = @index(Global)
+    i_l = @index(Local)
+
+    @uniform T = eltype(φ)
+    @uniform N = @groupsize()[1]
+    @uniform N_Δt = size(φ, 2)
+
+    s_α_r = @localmem T (N,)
+    s_α_i = @localmem T (N,)
+    s_β_i = @localmem T (N,)
+    s_β_r = @localmem T (N,)
+    s_Mxy_r = @localmem T (N,)
+    s_Mxy_i = @localmem T (N,)
+    s_Mxy_new_r = @localmem T (N,)
+    s_Mxy_new_i = @localmem T (N,)
+    s_Mz = @localmem T (N,)
+    s_Mz_new = @localmem T (N,)
+    s_ρ = @localmem T (N,)
+
+    @inbounds s_Mxy_r[i_l] = real(Mxy[i_g])
+    @inbounds s_Mxy_i[i_l]  = imag(Mxy[i_g])
+    @inbounds s_Mz[i_l] = Mz[i_g]
+    @inbounds s_ρ[i_l] = ρ[i_g]
+
+    @inbounds for t = 1 : N_Δt
+        sin_φ = sin(φ[i_g, t]) #TO-DO: use sincos once oneAPI releases version with https://github.com/JuliaGPU/oneAPI.jl/commit/260a4dda0ea223dbf0893de7b4a13d994ae27bd1
+        cos_φ = cos(φ[i_g, t])
+        s_α_r[i_l] = cos_φ
+        if (iszero(B[i_g, t]))
+            s_α_i[i_l] = -(Bz[i_g, t] / (B[i_g, t] + eps(T))) * sin_φ
+            s_β_r[i_l] = (imag(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
+            s_β_i[i_l] = -(real(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
+        else
+            s_α_i[i_l] = -(Bz[i_g, t] / B[i_g, t]) * sin_φ
+            s_β_r[i_l] = (imag(B1[t]) / B[i_g, t]) * sin_φ
+            s_β_i[i_l] = -(real(B1[t]) / B[i_g, t]) * sin_φ
+        end
+            s_Mxy_new_r[i_l] = 2 * (s_Mxy_i[i_l] * (s_α_r[i_l] * s_α_i[i_l] - s_β_r[i_l] * s_β_i[i_l]) +
+                                s_Mz[i_l] * (s_α_i[i_l] * s_β_i[i_l] + s_α_r[i_l] * s_β_r[i_l])) +
+                                s_Mxy_r[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 - s_β_r[i_l]^2 + s_β_i[i_l]^2)
+            s_Mxy_new_i[i_l] = -2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_α_i[i_l] + s_β_r[i_l] * s_β_i[i_l]) -
+                                s_Mz[i_l] * (s_α_r[i_l] * s_β_i[i_l] - s_α_i[i_l] * s_β_r[i_l])) +
+                                s_Mxy_i[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 + s_β_r[i_l]^2 - s_β_i[i_l]^2)
+            s_Mz_new[i_l] = s_Mz[i_l] * (s_α_r[i_l]^2 + s_α_i[i_l]^2 - s_β_r[i_l]^2 - s_β_i[i_l]^2) -
+                            2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_β_r[i_l] - s_α_i[i_l] * s_β_i[i_l]) +
+                            s_Mxy_i[i_l] * (s_α_r[i_l] * s_β_i[i_l] + s_α_i[i_l] * s_β_r[i_l]))
+        s_Mxy_r[i_l] = s_Mxy_new_r[i_l] * ΔT2[i_g, t]
+        s_Mxy_i[i_l] = s_Mxy_new_i[i_l] * ΔT2[i_g, t]
+        s_Mz[i_l] = s_Mz_new[i_l] * ΔT1[i_g, t] + s_ρ[i_l] * (1 - ΔT1[i_g, t])
+    end
+
+    @inbounds Mxy[i_g] = s_Mxy_r[i_l] + 1im * s_Mxy_i[i_l]
+    @inbounds Mz[i_g] = s_Mz[i_l]
+end
+
+## COV_EXCL_STOP
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -332,9 +332,7 @@ function simulate(
     sim_params = default_sim_params(sim_params)
     #Warn if user is trying to run on CPU without enabling multi-threading
     if (!sim_params["gpu"] && Threads.nthreads() == 1)
-        @info """
-            Simulation will be run on the CPU with only 1 thread. To
-            enable multi-threading, start julia with --threads=auto.
+        @info """Simulation will be run on the CPU with only 1 thread. To enable multi-threading, start julia with --threads=auto
         """ maxlog=1
     end
     # Simulation init
Benchmark suite	Current: `1b6c5be`	Previous: `1457a4c`	Ratio
`MRI Lab/Bloch/CPU/2 thread(s)`	`226897725.5` ns	`227517325.5` ns	`1.00`
`MRI Lab/Bloch/CPU/4 thread(s)`	`134821691` ns	`135033124` ns	`1.00`
`MRI Lab/Bloch/CPU/8 thread(s)`	`143505624` ns	`171880824` ns	`0.83`
`MRI Lab/Bloch/CPU/1 thread(s)`	`343336904` ns	`396561930.5` ns	`0.87`
`MRI Lab/Bloch/GPU/CUDA`	`56897573.5` ns	`138134905` ns	`0.41`
`MRI Lab/Bloch/GPU/oneAPI`	`516675827` ns	`14155999496.5` ns	`0.03649871753158408`
`MRI Lab/Bloch/GPU/Metal`	`561507500` ns	`3171338479` ns	`0.18`
`MRI Lab/Bloch/GPU/AMDGPU`	`36448199` ns	`75482754` ns	`0.48`
`Slice Selection 3D/Bloch/CPU/2 thread(s)`	`1157773672` ns	`1168211452` ns	`0.99`
`Slice Selection 3D/Bloch/CPU/4 thread(s)`	`621102018` ns	`612565463` ns	`1.01`
`Slice Selection 3D/Bloch/CPU/8 thread(s)`	`383824312` ns	`495427593` ns	`0.77`
`Slice Selection 3D/Bloch/CPU/1 thread(s)`	`1947221684` ns	`2245843835` ns	`0.87`
`Slice Selection 3D/Bloch/GPU/CUDA`	`101669032` ns	`108701927` ns	`0.94`
`Slice Selection 3D/Bloch/GPU/oneAPI`	`649875888` ns	`776956866` ns	`0.84`
`Slice Selection 3D/Bloch/GPU/Metal`	`565000312.5` ns	`769082459` ns	`0.73`
`Slice Selection 3D/Bloch/GPU/AMDGPU`	`60318280` ns	`64232156` ns	`0.94`