Skip to content

Commit

Permalink
Optimize run_spin_precession! for GPU (#459)
Browse files Browse the repository at this point in the history
Optimize run_spin_precession! for GPU
  • Loading branch information
rkierulf authored Jul 31, 2024
1 parent b74a9ff commit 1457a4c
Show file tree
Hide file tree
Showing 17 changed files with 264 additions and 93 deletions.
2 changes: 1 addition & 1 deletion KomaMRIBase/src/KomaMRIBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ using MRIBase
Profile, RawAcquisitionData, AcquisitionData, AcquisitionHeader, EncodingCounters, Limit
using MAT # For loading example phantoms

global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
const global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T

# Hardware
include("datatypes/Scanner.jl")
Expand Down
4 changes: 2 additions & 2 deletions KomaMRIBase/src/timing/TrapezoidalIntegration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@ Trapezoidal cumulative integration over time for every spin of a phantom.
phantom
"""
function cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}) where {T<:Real}
y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt ./ 2)
y = cumsum(y, dims=2)
return y
end
function cumtrapz(Δt::AbstractVector{T}, x::AbstractVector{T}) where {T<:Real}
y = (x[2:end] .+ x[1:end-1]) .* (Δt / 2)
y = (x[2:end] .+ x[1:end-1]) .* (Δt ./ 2)
y = cumsum(y)
return y
end
2 changes: 1 addition & 1 deletion KomaMRICore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CUDA = "3, 4, 5"
Functors = "0.4"
KernelAbstractions = "0.9"
KomaMRIBase = "0.9"
Metal = "1"
Metal = "1.2"
ProgressMeter = "1"
Reexport = "1"
ThreadsX = "0.1"
Expand Down
1 change: 1 addition & 0 deletions KomaMRICore/ext/KomaAMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ KomaMRICore.isfunctional(::ROCBackend) = AMDGPU.functional()
KomaMRICore.set_device!(::ROCBackend, dev_idx::Integer) = AMDGPU.device_id!(dev_idx)
KomaMRICore.set_device!(::ROCBackend, dev::AMDGPU.HIPDevice) = AMDGPU.device!(dev)
KomaMRICore.device_name(::ROCBackend) = AMDGPU.HIP.name(AMDGPU.device())
@inline KomaMRICore._cis(x) = cis(x)

function KomaMRICore._print_devices(::ROCBackend)
devices = [
Expand Down
1 change: 1 addition & 0 deletions KomaMRICore/ext/KomaCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ KomaMRICore.name(::CUDABackend) = "CUDA"
KomaMRICore.isfunctional(::CUDABackend) = CUDA.functional()
KomaMRICore.set_device!(::CUDABackend, val) = CUDA.device!(val)
KomaMRICore.device_name(::CUDABackend) = CUDA.name(CUDA.device())
@inline KomaMRICore._cis(x) = cis(x)

function KomaMRICore._print_devices(::CUDABackend)
devices = [
Expand Down
9 changes: 1 addition & 8 deletions KomaMRICore/ext/KomaMetalExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,14 @@ KomaMRICore.isfunctional(::MetalBackend) = Metal.functional()
KomaMRICore.set_device!(::MetalBackend, device_index::Integer) = device_index == 1 || @warn "Metal does not support multiple gpu devices. Ignoring the device setting."
KomaMRICore.set_device!(::MetalBackend, dev::Metal.MTLDevice) = Metal.device!(dev)
KomaMRICore.device_name(::MetalBackend) = String(Metal.current_device().name)
@inline KomaMRICore._cis(x) = cis(x)

function KomaMRICore._print_devices(::MetalBackend)
@info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
end

#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
#can be removed
Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))

function __init__()
push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
@warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
end

end
Expand Down
35 changes: 34 additions & 1 deletion KomaMRICore/ext/KomaoneAPIExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ KomaMRICore.name(::oneAPIBackend) = "oneAPI"
KomaMRICore.isfunctional(::oneAPIBackend) = oneAPI.functional()
KomaMRICore.set_device!(::oneAPIBackend, val) = oneAPI.device!(val)
KomaMRICore.device_name(::oneAPIBackend) = oneAPI.properties(oneAPI.device()).name
@inline KomaMRICore._cis(x) = cos(x) + im * sin(x)

function KomaMRICore._print_devices(::oneAPIBackend)
devices = [
Expand All @@ -20,9 +21,41 @@ end
#Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))

using KernelAbstractions: @index, @kernel, @Const, synchronize

"""Naive cumsum implementation for matrix, parallelizes along the first dimension"""
Base.cumsum(A::oneArray{T}; dims) where T = begin
dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
backend = oneAPIBackend()
B = similar(A)
cumsum_kernel = naive_cumsum!(backend)
cumsum_kernel(B, A, ndrange=size(A,1))
synchronize(backend)
return B
end

Base.cumsum!(B::oneArray{T}, A::oneArray{T}; dims) where T = begin
dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
backend = oneAPIBackend()
cumsum_kernel = naive_cumsum!(backend)
cumsum_kernel(B, A, ndrange=size(A,1))
synchronize(backend)
end

## COV_EXCL_START
@kernel function naive_cumsum!(B, @Const(A))
i = @index(Global)

cur_val = 0.0f0
for k 1:size(A, 2)
@inbounds cur_val += A[i, k]
@inbounds B[i, k] = cur_val
end
end
## COV_EXCL_STOP

function __init__()
push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
@warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"
Expand Down
4 changes: 4 additions & 0 deletions KomaMRICore/src/simulation/GPUFunctions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ _print_devices(::KA.CPU) = @info "CPU: $(length(Sys.cpu_info())) x $(Sys.cpu_inf
name(::KA.CPU) = "CPU"
set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'"

#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
#for now we use a custom function for each backend to implement
function _cis end

"""
get_backend(use_gpu)
Expand Down
32 changes: 15 additions & 17 deletions KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
"""Stores preallocated structs for use in Bloch CPU run_spin_precession! and run_spin_excitation! functions."""
struct BlochCPUPrealloc{T} <: PreallocResult{T}
M::Mag{T} # Mag{T}
Bz_old::AbstractVector{T} # Vector{T}(Nspins x 1)
Bz_new::AbstractVector{T} # Vector{T}(Nspins x 1)
ϕ::AbstractVector{T} # Vector{T}(Nspins x 1)
φ::AbstractVector{T} # Vector{T}(Nspins x 1)
Rot::Spinor{T} # Spinor{T}
ΔBz::AbstractVector{T} # Vector{T}(Nspins x 1)
end

Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
Expand All @@ -14,13 +14,13 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
p.Bz_old[i],
p.Bz_new[i],
p.ϕ[i],
p.φ[i],
p.Rot[i]
p.Rot[i],
p.ΔBz[i]
)
end

"""Preallocates arrays for use in run_spin_precession."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
return BlochCPUPrealloc(
Mag(
similar(M.xy),
Expand All @@ -29,11 +29,11 @@ function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}
similar(obj.x),
similar(obj.x),
similar(obj.x),
similar(obj.x),
Spinor(
similar(M.xy),
similar(M.xy)
)
),
obj.Δw ./ T(2π .* γ)
)
end

Expand Down Expand Up @@ -63,8 +63,9 @@ function run_spin_precession!(
Bz_new = prealloc.Bz_new
ϕ = prealloc.ϕ
Mxy = prealloc.M.xy
ΔBz = prealloc.ΔBz
fill!(ϕ, zero(T))
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + ΔBz

# Fill sig[1] if needed
ADC_idx = 1
Expand All @@ -79,7 +80,7 @@ function run_spin_precession!(
t_seq += seq.Δt[seq_idx-1]

#Effective Field
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + ΔBz

#Rotation
@. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]
Expand Down Expand Up @@ -116,18 +117,15 @@ function run_spin_excitation!(
backend::KA.CPU,
prealloc::BlochCPUPrealloc
) where {T<:Real}
ΔBz = prealloc.Bz_old
Bz = prealloc.Bz_new
B = prealloc.ϕ
φ = prealloc.φ
Bz = prealloc.Bz_old
B = prealloc.Bz_new
φ = prealloc.ϕ
α = prealloc.Rot.α
β = prealloc.Rot.β
ΔBz = prealloc.ΔBz
Maux_xy = prealloc.M.xy
Maux_z = prealloc.M.z

#Can be calculated outside of loop
@. ΔBz = p.Δw / T(2π * γ)

#Simulation
for s in seq #This iterates over seq, "s = seq[i,:]"
#Motion
Expand Down
153 changes: 153 additions & 0 deletions KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster
to calculate them on the CPU before the simulation is run."""
struct SeqBlockProperties{T<:Real}
length::Int64
nADC::Int64
first_ADC::Bool
ADC_indices::AbstractVector{Int64}
tp_ADC::AbstractVector{T}
dur::T
end

@functor SeqBlockProperties

"""Stores information added to the prealloc struct for use in run_spin_precession!
and run_spin_excitation!. This information is calculated on the CPU before the
simulation is run."""
struct BlochGPUPrecalc{T} <: PrecalcResult{T}
seq_properties::AbstractVector{SeqBlockProperties{T}}
end

@functor BlochGPUPrecalc

"""Precalculates sequence properties for use in run_spin_precession!"""
function precalculate(
sim_method::Bloch,
backend::KA.GPU,
seq::DiscreteSequence{T},
parts::Vector{UnitRange{S}},
excitation_bool::Vector{Bool}
) where {T<:Real,S<:Integer}
seq_properties = SeqBlockProperties{T}[]
for (block, p) in enumerate(parts)
seq_block = @view seq[p]
if excitation_bool[block]
push!(seq_properties, SeqBlockProperties(
length(seq_block.t),
count(seq_block.ADC),
false,
Int64[],
T[],
zero(T)
))
else
ADC_indices = findall(seq_block.ADC) .- 1
if seq_block.ADC[1]
deleteat!(ADC_indices, 1)
end
tp = cumsum(seq_block.Δt)
push!(seq_properties, SeqBlockProperties(
length(seq_block.t),
count(seq_block.ADC),
seq_block.ADC[1],
ADC_indices,
tp[ADC_indices],
last(tp)
))
end
end

return BlochGPUPrecalc(seq_properties)
end

"""Stores preallocated structs for use in Bloch GPU run_spin_precession function."""
struct BlochGPUPrealloc{T} <: PreallocResult{T}
seq_properties::AbstractVector{SeqBlockProperties{T}}
Bz::AbstractMatrix{T}
Δϕ::AbstractMatrix{T}
ϕ::AbstractArray{T}
Mxy::AbstractMatrix{Complex{T}}
ΔBz::AbstractVector{T}
end

Base.view(p::BlochGPUPrealloc{T}, i::UnitRange) where {T<:Real} = p
function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
seq_block = p.seq_properties[i]

return BlochGPUPrealloc(
[seq_block],
view(p.Bz,:,1:seq_block.length),
view(p.Δϕ,:,1:seq_block.length-1),
seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
view(p.Mxy,:,1:seq_block.nADC),
p.ΔBz
)
end

"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
if !(precalc isa BlochGPUPrecalc) @error """Sim method Bloch() does not support calling run_sim_time_iter directly. Use method BlochSimple() instead.""" end

return BlochGPUPrealloc(
precalc.seq_properties,
KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
obj.Δw ./ T(2π .* γ)
)
end

@inline function calculate_precession!(Δϕ::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real}
Δϕ .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ)
end
@inline function apply_precession!::AbstractVector{T}, Δϕ::AbstractArray{T}) where {T<:Real}
ϕ .= sum(Δϕ, dims=2)
end
function apply_precession!::AbstractArray{T}, Δϕ::AbstractArray{T}) where {T<:Real}
cumsum!(ϕ, Δϕ, dims=2)
end

function run_spin_precession!(
p::Phantom{T},
seq::DiscreteSequence{T},
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::Bloch,
backend::KA.GPU,
pre::BlochGPUPrealloc
) where {T<:Real}
#Simulation
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')

#Sequence block info
seq_block = pre.seq_properties[1]

#Effective field
pre.Bz .= x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ pre.ΔBz

#Rotation
calculate_precession!(pre.Δϕ, seq.Δt', pre.Bz)
# Reduces Δϕ Nspins x Nt to ϕ Nspins x Nt, if Nadc = 0, to Nspins x 1
apply_precession!(pre.ϕ, pre.Δϕ)

#Acquired signal
if seq_block.nADC > 0
ϕ_ADC = @view pre.ϕ[:,seq_block.ADC_indices]
if seq_block.first_ADC
pre.Mxy[:,1] .= M.xy
pre.Mxy[:,2:end] .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* _cis.(ϕ_ADC)
else
pre.Mxy .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* _cis.(ϕ_ADC)
end

sig .= transpose(sum(pre.Mxy; dims=1))
end

#Mxy precession and relaxation, and Mz relaxation
M.z .= M.z .* exp.(-seq_block.dur ./ p.T1) .+ p.ρ .* (T(1) .- exp.(-seq_block.dur ./ p.T1))
M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* _cis.(pre.ϕ[:,end])

return nothing
end
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ function run_spin_precession!(
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
#Rotation
if is_ADC_on(seq)
ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
ϕ = T(-2π .* γ) .* cumtrapz(seq.Δt', Bz)
else
ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ function run_spin_precession!(
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
#Rotation
if is_ADC_on(seq)
ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
ϕ = T(-2π .* γ) .* cumtrapz(seq.Δt', Bz)
else
ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
end
Expand Down
Loading

1 comment on commit 1457a4c

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

KomaMRI Benchmarks

Benchmark suite Current: 1457a4c Previous: 24b21f4 Ratio
MRI Lab/Bloch/CPU/2 thread(s) 227517325.5 ns 235229966.5 ns 0.97
MRI Lab/Bloch/CPU/4 thread(s) 135033124 ns 140495905 ns 0.96
MRI Lab/Bloch/CPU/8 thread(s) 171880824 ns 169591756.5 ns 1.01
MRI Lab/Bloch/CPU/1 thread(s) 396561930.5 ns 419227547 ns 0.95
MRI Lab/Bloch/GPU/CUDA 138134905 ns 135837984 ns 1.02
MRI Lab/Bloch/GPU/oneAPI 14155999496.5 ns 18356788557 ns 0.77
MRI Lab/Bloch/GPU/Metal 3171338479 ns 2931106125 ns 1.08
MRI Lab/Bloch/GPU/AMDGPU 75482754 ns 1750964243 ns 0.04310924926180803
Slice Selection 3D/Bloch/CPU/2 thread(s) 1168211452 ns 1174040352 ns 1.00
Slice Selection 3D/Bloch/CPU/4 thread(s) 612565463 ns 622515059.5 ns 0.98
Slice Selection 3D/Bloch/CPU/8 thread(s) 495427593 ns 492840880 ns 1.01
Slice Selection 3D/Bloch/CPU/1 thread(s) 2245843835 ns 2264093136 ns 0.99
Slice Selection 3D/Bloch/GPU/CUDA 108701927 ns 257306603 ns 0.42
Slice Selection 3D/Bloch/GPU/oneAPI 776956866 ns 1678945735.5 ns 0.46
Slice Selection 3D/Bloch/GPU/Metal 769082459 ns 1129875875 ns 0.68
Slice Selection 3D/Bloch/GPU/AMDGPU 64232156 ns 679066674 ns 0.0945888797953292

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.