Optimize run_spin_precession! for GPU (#459)

Optimize run_spin_precession! for GPU
JuliaHealth · Jul 31, 2024 · 1457a4c · 1457a4c · github-actions · Jul 31, 2024
1 parent b74a9ff
commit 1457a4c
Show file tree

Hide file tree

Showing 17 changed files with 264 additions and 93 deletions.
diff --git a/KomaMRIBase/src/KomaMRIBase.jl b/KomaMRIBase/src/KomaMRIBase.jl
@@ -14,7 +14,7 @@ using MRIBase
     Profile, RawAcquisitionData, AcquisitionData, AcquisitionHeader, EncodingCounters, Limit
 using MAT   # For loading example phantoms
 
-global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
+const global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
 
 # Hardware
 include("datatypes/Scanner.jl")

diff --git a/KomaMRIBase/src/timing/TrapezoidalIntegration.jl b/KomaMRIBase/src/timing/TrapezoidalIntegration.jl
@@ -45,12 +45,12 @@ Trapezoidal cumulative integration over time for every spin of a phantom.
     phantom
 """
 function cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}) where {T<:Real}
-    y =  (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
+    y =  (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt ./ 2)
     y = cumsum(y, dims=2)
     return y
 end
 function cumtrapz(Δt::AbstractVector{T}, x::AbstractVector{T}) where {T<:Real}
-    y = (x[2:end] .+ x[1:end-1]) .* (Δt / 2)
+    y = (x[2:end] .+ x[1:end-1]) .* (Δt ./ 2)
     y = cumsum(y)
     return y
 end
diff --git a/KomaMRICore/Project.toml b/KomaMRICore/Project.toml
@@ -31,7 +31,7 @@ CUDA = "3, 4, 5"
 Functors = "0.4"
 KernelAbstractions = "0.9"
 KomaMRIBase = "0.9"
-Metal = "1"
+Metal = "1.2"
 ProgressMeter = "1"
 Reexport = "1"
 ThreadsX = "0.1"

diff --git a/KomaMRICore/ext/KomaAMDGPUExt.jl b/KomaMRICore/ext/KomaAMDGPUExt.jl
@@ -9,6 +9,7 @@ KomaMRICore.isfunctional(::ROCBackend) = AMDGPU.functional()
 KomaMRICore.set_device!(::ROCBackend, dev_idx::Integer) = AMDGPU.device_id!(dev_idx)
 KomaMRICore.set_device!(::ROCBackend, dev::AMDGPU.HIPDevice) = AMDGPU.device!(dev)
 KomaMRICore.device_name(::ROCBackend) = AMDGPU.HIP.name(AMDGPU.device())
+@inline KomaMRICore._cis(x) = cis(x)
 
 function KomaMRICore._print_devices(::ROCBackend)
     devices = [

diff --git a/KomaMRICore/ext/KomaCUDAExt.jl b/KomaMRICore/ext/KomaCUDAExt.jl
@@ -8,6 +8,7 @@ KomaMRICore.name(::CUDABackend) = "CUDA"
 KomaMRICore.isfunctional(::CUDABackend) = CUDA.functional()
 KomaMRICore.set_device!(::CUDABackend, val) = CUDA.device!(val)
 KomaMRICore.device_name(::CUDABackend) = CUDA.name(CUDA.device())
+@inline KomaMRICore._cis(x) = cis(x)
 
 function KomaMRICore._print_devices(::CUDABackend)
     devices = [

diff --git a/KomaMRICore/ext/KomaMetalExt.jl b/KomaMRICore/ext/KomaMetalExt.jl
@@ -11,21 +11,14 @@ KomaMRICore.isfunctional(::MetalBackend) = Metal.functional()
 KomaMRICore.set_device!(::MetalBackend, device_index::Integer) = device_index == 1 || @warn "Metal does not support multiple gpu devices. Ignoring the device setting."
 KomaMRICore.set_device!(::MetalBackend, dev::Metal.MTLDevice) = Metal.device!(dev)
 KomaMRICore.device_name(::MetalBackend) = String(Metal.current_device().name)
+@inline KomaMRICore._cis(x) = cis(x)
 
 function KomaMRICore._print_devices(::MetalBackend)
     @info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
 end
 
-#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
-#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
-#can be removed
-Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
-Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
-Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))
-
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
-    @warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
 end
 
 end

diff --git a/KomaMRICore/ext/KomaoneAPIExt.jl b/KomaMRICore/ext/KomaoneAPIExt.jl
@@ -8,6 +8,7 @@ KomaMRICore.name(::oneAPIBackend) = "oneAPI"
 KomaMRICore.isfunctional(::oneAPIBackend) = oneAPI.functional()
 KomaMRICore.set_device!(::oneAPIBackend, val) = oneAPI.device!(val)
 KomaMRICore.device_name(::oneAPIBackend) = oneAPI.properties(oneAPI.device()).name
+@inline KomaMRICore._cis(x) = cos(x) + im * sin(x)
 
 function KomaMRICore._print_devices(::oneAPIBackend)
     devices = [
@@ -20,9 +21,41 @@ end
 #Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
 #Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
 Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
-Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
 Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))
 
+using KernelAbstractions: @index, @kernel, @Const, synchronize
+
+"""Naive cumsum implementation for matrix, parallelizes along the first dimension"""
+Base.cumsum(A::oneArray{T}; dims) where T = begin
+    dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
+    backend = oneAPIBackend()
+    B = similar(A)
+    cumsum_kernel = naive_cumsum!(backend)
+    cumsum_kernel(B, A, ndrange=size(A,1))
+    synchronize(backend)
+    return B
+end
+
+Base.cumsum!(B::oneArray{T}, A::oneArray{T}; dims) where T = begin
+    dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
+    backend = oneAPIBackend()
+    cumsum_kernel = naive_cumsum!(backend)
+    cumsum_kernel(B, A, ndrange=size(A,1))
+    synchronize(backend)
+end
+
+## COV_EXCL_START
+@kernel function naive_cumsum!(B, @Const(A))
+    i = @index(Global)
+
+    cur_val = 0.0f0
+    for k ∈ 1:size(A, 2)
+        @inbounds cur_val += A[i, k]
+        @inbounds B[i, k] = cur_val
+    end
+end
+## COV_EXCL_STOP
+
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
     @warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"

diff --git a/KomaMRICore/src/simulation/GPUFunctions.jl b/KomaMRICore/src/simulation/GPUFunctions.jl
@@ -9,6 +9,10 @@ _print_devices(::KA.CPU) = @info "CPU: $(length(Sys.cpu_info())) x $(Sys.cpu_inf
 name(::KA.CPU) = "CPU"
 set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'" 
 
+#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
+#for now we use a custom function for each backend to implement
+function _cis end
+
 """
     get_backend(use_gpu)
 

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -1,11 +1,11 @@
-"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
+"""Stores preallocated structs for use in Bloch CPU run_spin_precession! and run_spin_excitation! functions."""
 struct BlochCPUPrealloc{T} <: PreallocResult{T}
     M::Mag{T}                               # Mag{T}
     Bz_old::AbstractVector{T}               # Vector{T}(Nspins x 1)
     Bz_new::AbstractVector{T}               # Vector{T}(Nspins x 1)
     ϕ::AbstractVector{T}                    # Vector{T}(Nspins x 1)
-    φ::AbstractVector{T}                    # Vector{T}(Nspins x 1)
     Rot::Spinor{T}                          # Spinor{T}
+    ΔBz::AbstractVector{T}            # Vector{T}(Nspins x 1)
 end
 
 Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
@@ -14,13 +14,13 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
         p.Bz_old[i],
         p.Bz_new[i],
         p.ϕ[i],
-        p.φ[i],
-        p.Rot[i]
+        p.Rot[i],
+        p.ΔBz[i]
     )
 end
 
-"""Preallocates arrays for use in run_spin_precession."""
-function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
+"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
+function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
     return BlochCPUPrealloc(
         Mag(
             similar(M.xy),
@@ -29,11 +29,11 @@ function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}
         similar(obj.x),
         similar(obj.x),
         similar(obj.x),
-        similar(obj.x),
         Spinor(
             similar(M.xy),
             similar(M.xy)
-        )
+        ),
+        obj.Δw ./ T(2π .* γ)
     )
 end
 
@@ -63,8 +63,9 @@ function run_spin_precession!(
     Bz_new = prealloc.Bz_new
     ϕ = prealloc.ϕ
     Mxy = prealloc.M.xy
+    ΔBz = prealloc.ΔBz
     fill!(ϕ, zero(T))
-    @. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
+    @. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + ΔBz
 
     # Fill sig[1] if needed
     ADC_idx = 1
@@ -79,7 +80,7 @@ function run_spin_precession!(
         t_seq += seq.Δt[seq_idx-1]
 
         #Effective Field
-        @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
+        @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + ΔBz
 
         #Rotation
         @. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]
@@ -116,18 +117,15 @@ function run_spin_excitation!(
     backend::KA.CPU,
     prealloc::BlochCPUPrealloc
 ) where {T<:Real}
-    ΔBz = prealloc.Bz_old
-    Bz = prealloc.Bz_new
-    B = prealloc.ϕ
-    φ = prealloc.φ
+    Bz = prealloc.Bz_old
+    B = prealloc.Bz_new
+    φ = prealloc.ϕ
     α = prealloc.Rot.α
     β = prealloc.Rot.β
+    ΔBz = prealloc.ΔBz
     Maux_xy = prealloc.M.xy
     Maux_z = prealloc.M.z
 
-    #Can be calculated outside of loop
-    @. ΔBz = p.Δw / T(2π * γ)
-
     #Simulation
     for s in seq #This iterates over seq, "s = seq[i,:]"
         #Motion

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
@@ -0,0 +1,153 @@
+"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster
+to calculate them on the CPU before the simulation is run."""
+struct SeqBlockProperties{T<:Real}
+    length::Int64
+    nADC::Int64
+    first_ADC::Bool
+    ADC_indices::AbstractVector{Int64}
+    tp_ADC::AbstractVector{T}
+    dur::T
+end
+
+@functor SeqBlockProperties
+
+"""Stores information added to the prealloc struct for use in run_spin_precession!
+and run_spin_excitation!. This information is calculated on the CPU before the
+simulation is run."""
+struct BlochGPUPrecalc{T} <: PrecalcResult{T}
+    seq_properties::AbstractVector{SeqBlockProperties{T}}
+end
+
+@functor BlochGPUPrecalc
+
+"""Precalculates sequence properties for use in run_spin_precession!"""
+function precalculate(
+    sim_method::Bloch, 
+    backend::KA.GPU,
+    seq::DiscreteSequence{T},
+    parts::Vector{UnitRange{S}}, 
+    excitation_bool::Vector{Bool}
+) where {T<:Real,S<:Integer}
+    seq_properties = SeqBlockProperties{T}[]
+    for (block, p) in enumerate(parts)
+        seq_block = @view seq[p]
+        if excitation_bool[block]
+            push!(seq_properties, SeqBlockProperties(
+                length(seq_block.t),
+                count(seq_block.ADC),
+                false,
+                Int64[],
+                T[],
+                zero(T)
+            ))
+        else
+            ADC_indices = findall(seq_block.ADC) .- 1
+            if seq_block.ADC[1]
+                deleteat!(ADC_indices, 1)
+            end
+            tp = cumsum(seq_block.Δt)
+            push!(seq_properties, SeqBlockProperties(
+                length(seq_block.t),
+                count(seq_block.ADC),
+                seq_block.ADC[1],
+                ADC_indices,
+                tp[ADC_indices],
+                last(tp)
+            ))
+        end
+    end
+
+    return BlochGPUPrecalc(seq_properties)
+end
+
+"""Stores preallocated structs for use in Bloch GPU run_spin_precession function."""
+struct BlochGPUPrealloc{T} <: PreallocResult{T}
+    seq_properties::AbstractVector{SeqBlockProperties{T}}
+    Bz::AbstractMatrix{T}
+    Δϕ::AbstractMatrix{T}
+    ϕ::AbstractArray{T}
+    Mxy::AbstractMatrix{Complex{T}}
+    ΔBz::AbstractVector{T}
+end
+
+Base.view(p::BlochGPUPrealloc{T}, i::UnitRange) where {T<:Real} = p
+function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
+    seq_block = p.seq_properties[i]
+
+    return BlochGPUPrealloc(
+        [seq_block],
+        view(p.Bz,:,1:seq_block.length),
+        view(p.Δϕ,:,1:seq_block.length-1),
+        seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
+        view(p.Mxy,:,1:seq_block.nADC),
+        p.ΔBz
+    )
+end
+
+"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
+function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
+    if !(precalc isa BlochGPUPrecalc) @error """Sim method Bloch() does not support calling run_sim_time_iter directly. Use method BlochSimple() instead.""" end
+
+    return BlochGPUPrealloc(
+        precalc.seq_properties,
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
+        KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
+        KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
+        obj.Δw ./ T(2π .* γ)
+    )
+end
+
+@inline function calculate_precession!(Δϕ::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real}
+    Δϕ .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ)
+end
+@inline function apply_precession!(ϕ::AbstractVector{T}, Δϕ::AbstractArray{T}) where {T<:Real}
+    ϕ .= sum(Δϕ, dims=2)
+end
+function apply_precession!(ϕ::AbstractArray{T}, Δϕ::AbstractArray{T}) where {T<:Real}
+    cumsum!(ϕ, Δϕ, dims=2)
+end
+
+function run_spin_precession!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::Bloch,
+    backend::KA.GPU,
+    pre::BlochGPUPrealloc
+) where {T<:Real}
+    #Simulation
+    #Motion
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+
+    #Sequence block info
+    seq_block = pre.seq_properties[1]
+
+    #Effective field
+    pre.Bz .= x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ pre.ΔBz
+
+    #Rotation
+    calculate_precession!(pre.Δϕ, seq.Δt', pre.Bz)
+    # Reduces Δϕ Nspins x Nt to ϕ Nspins x Nt, if Nadc = 0, to Nspins x 1 
+    apply_precession!(pre.ϕ, pre.Δϕ)
+
+    #Acquired signal
+    if seq_block.nADC > 0
+        ϕ_ADC = @view pre.ϕ[:,seq_block.ADC_indices]
+        if seq_block.first_ADC
+            pre.Mxy[:,1] .= M.xy
+            pre.Mxy[:,2:end] .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* _cis.(ϕ_ADC)
+        else
+            pre.Mxy .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* _cis.(ϕ_ADC)
+        end
+
+        sig .= transpose(sum(pre.Mxy; dims=1))
+    end
+
+    #Mxy precession and relaxation, and Mz relaxation
+    M.z  .= M.z .* exp.(-seq_block.dur ./ p.T1) .+ p.ρ .* (T(1) .- exp.(-seq_block.dur ./ p.T1))
+    M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* _cis.(pre.ϕ[:,end])
+
+    return nothing
+end 
diff --git a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl
@@ -45,7 +45,7 @@ function run_spin_precession!(
     Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
     #Rotation
     if is_ADC_on(seq)
-        ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+        ϕ = T(-2π .* γ) .* cumtrapz(seq.Δt', Bz)
     else
         ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
     end

diff --git a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
@@ -37,7 +37,7 @@ function run_spin_precession!(
     Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
     #Rotation
     if is_ADC_on(seq)
-        ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+        ϕ = T(-2π .* γ) .* cumtrapz(seq.Δt', Bz)
     else
         ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
     end
Benchmark suite	Current: `1457a4c`	Previous: `24b21f4`	Ratio
`MRI Lab/Bloch/CPU/2 thread(s)`	`227517325.5` ns	`235229966.5` ns	`0.97`
`MRI Lab/Bloch/CPU/4 thread(s)`	`135033124` ns	`140495905` ns	`0.96`
`MRI Lab/Bloch/CPU/8 thread(s)`	`171880824` ns	`169591756.5` ns	`1.01`
`MRI Lab/Bloch/CPU/1 thread(s)`	`396561930.5` ns	`419227547` ns	`0.95`
`MRI Lab/Bloch/GPU/CUDA`	`138134905` ns	`135837984` ns	`1.02`
`MRI Lab/Bloch/GPU/oneAPI`	`14155999496.5` ns	`18356788557` ns	`0.77`
`MRI Lab/Bloch/GPU/Metal`	`3171338479` ns	`2931106125` ns	`1.08`
`MRI Lab/Bloch/GPU/AMDGPU`	`75482754` ns	`1750964243` ns	`0.04310924926180803`
`Slice Selection 3D/Bloch/CPU/2 thread(s)`	`1168211452` ns	`1174040352` ns	`1.00`
`Slice Selection 3D/Bloch/CPU/4 thread(s)`	`612565463` ns	`622515059.5` ns	`0.98`
`Slice Selection 3D/Bloch/CPU/8 thread(s)`	`495427593` ns	`492840880` ns	`1.01`
`Slice Selection 3D/Bloch/CPU/1 thread(s)`	`2245843835` ns	`2264093136` ns	`0.99`
`Slice Selection 3D/Bloch/GPU/CUDA`	`108701927` ns	`257306603` ns	`0.42`
`Slice Selection 3D/Bloch/GPU/oneAPI`	`776956866` ns	`1678945735.5` ns	`0.46`
`Slice Selection 3D/Bloch/GPU/Metal`	`769082459` ns	`1129875875` ns	`0.68`
`Slice Selection 3D/Bloch/GPU/AMDGPU`	`64232156` ns	`679066674` ns	`0.0945888797953292`