Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize run_spin_precession! for GPU #459

Merged
merged 13 commits into from
Jul 31, 2024
2 changes: 1 addition & 1 deletion KomaMRIBase/src/KomaMRIBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ using MRIBase
Profile, RawAcquisitionData, AcquisitionData, AcquisitionHeader, EncodingCounters, Limit
using MAT # For loading example phantoms

global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T
const global γ = 42.5774688e6 # Hz/T gyromagnetic constant for H1, JEMRIS uses 42.5756 MHz/T

# Hardware
include("datatypes/Scanner.jl")
Expand Down
4 changes: 2 additions & 2 deletions KomaMRIBase/src/timing/TrapezoidalIntegration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@
phantom
"""
function cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}) where {T<:Real}
y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt ./ 2)
y = cumsum(y, dims=2)
return y
end
function cumtrapz(Δt::AbstractVector{T}, x::AbstractVector{T}) where {T<:Real}
y = (x[2:end] .+ x[1:end-1]) .* (Δt / 2)
y = (x[2:end] .+ x[1:end-1]) .* (Δt ./ 2)

Check warning on line 53 in KomaMRIBase/src/timing/TrapezoidalIntegration.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRIBase/src/timing/TrapezoidalIntegration.jl#L53

Added line #L53 was not covered by tests
y = cumsum(y)
return y
end
2 changes: 1 addition & 1 deletion KomaMRICore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CUDA = "3, 4, 5"
Functors = "0.4"
KernelAbstractions = "0.9"
KomaMRIBase = "0.9"
Metal = "1"
Metal = "1.2"
ProgressMeter = "1"
Reexport = "1"
ThreadsX = "0.1"
Expand Down
1 change: 1 addition & 0 deletions KomaMRICore/ext/KomaAMDGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
KomaMRICore.set_device!(::ROCBackend, dev_idx::Integer) = AMDGPU.device_id!(dev_idx)
KomaMRICore.set_device!(::ROCBackend, dev::AMDGPU.HIPDevice) = AMDGPU.device!(dev)
KomaMRICore.device_name(::ROCBackend) = AMDGPU.HIP.name(AMDGPU.device())
@inline KomaMRICore._cis(x) = cis(x)

Check warning on line 12 in KomaMRICore/ext/KomaAMDGPUExt.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRICore/ext/KomaAMDGPUExt.jl#L12

Added line #L12 was not covered by tests

function KomaMRICore._print_devices(::ROCBackend)
devices = [
Expand Down
1 change: 1 addition & 0 deletions KomaMRICore/ext/KomaCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
KomaMRICore.isfunctional(::CUDABackend) = CUDA.functional()
KomaMRICore.set_device!(::CUDABackend, val) = CUDA.device!(val)
KomaMRICore.device_name(::CUDABackend) = CUDA.name(CUDA.device())
@inline KomaMRICore._cis(x) = cis(x)

Check warning on line 11 in KomaMRICore/ext/KomaCUDAExt.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRICore/ext/KomaCUDAExt.jl#L11

Added line #L11 was not covered by tests

function KomaMRICore._print_devices(::CUDABackend)
devices = [
Expand Down
9 changes: 1 addition & 8 deletions KomaMRICore/ext/KomaMetalExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,14 @@ KomaMRICore.isfunctional(::MetalBackend) = Metal.functional()
KomaMRICore.set_device!(::MetalBackend, device_index::Integer) = device_index == 1 || @warn "Metal does not support multiple gpu devices. Ignoring the device setting."
KomaMRICore.set_device!(::MetalBackend, dev::Metal.MTLDevice) = Metal.device!(dev)
KomaMRICore.device_name(::MetalBackend) = String(Metal.current_device().name)
@inline KomaMRICore._cis(x) = cis(x)

function KomaMRICore._print_devices(::MetalBackend)
@info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
end

#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
#can be removed
Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))

function __init__()
push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
@warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
end

end
Expand Down
35 changes: 34 additions & 1 deletion KomaMRICore/ext/KomaoneAPIExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
KomaMRICore.isfunctional(::oneAPIBackend) = oneAPI.functional()
KomaMRICore.set_device!(::oneAPIBackend, val) = oneAPI.device!(val)
KomaMRICore.device_name(::oneAPIBackend) = oneAPI.properties(oneAPI.device()).name
@inline KomaMRICore._cis(x) = cos(x) + im * sin(x)

Check warning on line 11 in KomaMRICore/ext/KomaoneAPIExt.jl

View check run for this annotation

Codecov / codecov/patch

KomaMRICore/ext/KomaoneAPIExt.jl#L11

Added line #L11 was not covered by tests

function KomaMRICore._print_devices(::oneAPIBackend)
devices = [
Expand All @@ -20,9 +21,41 @@
#Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))

using KernelAbstractions: @index, @kernel, @Const, synchronize

"""Naive cumsum implementation for matrix, parallelizes along the first dimension"""
Base.cumsum(A::oneArray{T}; dims) where T = begin
dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
backend = oneAPIBackend()
B = similar(A)
cumsum_kernel = naive_cumsum!(backend)
cumsum_kernel(B, A, ndrange=size(A,1))
synchronize(backend)
return B
end

Base.cumsum!(B::oneArray{T}, A::oneArray{T}; dims) where T = begin
dims == 2 || @error "oneAPI cumsum implementation only supports keyword argument dims=2"
backend = oneAPIBackend()
cumsum_kernel = naive_cumsum!(backend)
cumsum_kernel(B, A, ndrange=size(A,1))
synchronize(backend)
end

## COV_EXCL_START
@kernel function naive_cumsum!(B, @Const(A))
i = @index(Global)

cur_val = 0.0f0
for k ∈ 1:size(A, 2)
@inbounds cur_val += A[i, k]
@inbounds B[i, k] = cur_val
end
end
## COV_EXCL_STOP

function __init__()
push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
@warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"
Expand Down
4 changes: 4 additions & 0 deletions KomaMRICore/src/simulation/GPUFunctions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ _print_devices(::KA.CPU) = @info "CPU: $(length(Sys.cpu_info())) x $(Sys.cpu_inf
name(::KA.CPU) = "CPU"
set_device!(backend, val) = @error "set_device! called with invalid parameter types: '$(typeof(backend))', '$(typeof(val))'"

#oneAPI.jl doesn't support cis (https://github.com/JuliaGPU/oneAPI.jl/pull/443), so
#for now we use a custom function for each backend to implement
function _cis end

"""
get_backend(use_gpu)

Expand Down
32 changes: 15 additions & 17 deletions KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
"""Stores preallocated structs for use in Bloch CPU run_spin_precession! and run_spin_excitation! functions."""
struct BlochCPUPrealloc{T} <: PreallocResult{T}
M::Mag{T} # Mag{T}
Bz_old::AbstractVector{T} # Vector{T}(Nspins x 1)
Bz_new::AbstractVector{T} # Vector{T}(Nspins x 1)
ϕ::AbstractVector{T} # Vector{T}(Nspins x 1)
φ::AbstractVector{T} # Vector{T}(Nspins x 1)
Rot::Spinor{T} # Spinor{T}
ΔBz::AbstractVector{T} # Vector{T}(Nspins x 1)
end

Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
Expand All @@ -14,13 +14,13 @@ Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
p.Bz_old[i],
p.Bz_new[i],
p.ϕ[i],
p.φ[i],
p.Rot[i]
p.Rot[i],
p.ΔBz[i]
)
end

"""Preallocates arrays for use in run_spin_precession."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
return BlochCPUPrealloc(
Mag(
similar(M.xy),
Expand All @@ -29,11 +29,11 @@ function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}
similar(obj.x),
similar(obj.x),
similar(obj.x),
similar(obj.x),
Spinor(
similar(M.xy),
similar(M.xy)
)
),
obj.Δw ./ T(2π .* γ)
)
end

Expand Down Expand Up @@ -63,8 +63,9 @@ function run_spin_precession!(
Bz_new = prealloc.Bz_new
ϕ = prealloc.ϕ
Mxy = prealloc.M.xy
ΔBz = prealloc.ΔBz
fill!(ϕ, zero(T))
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
@. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + ΔBz

# Fill sig[1] if needed
ADC_idx = 1
Expand All @@ -79,7 +80,7 @@ function run_spin_precession!(
t_seq += seq.Δt[seq_idx-1]

#Effective Field
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
@. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + ΔBz

#Rotation
@. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]
Expand Down Expand Up @@ -116,18 +117,15 @@ function run_spin_excitation!(
backend::KA.CPU,
prealloc::BlochCPUPrealloc
) where {T<:Real}
ΔBz = prealloc.Bz_old
Bz = prealloc.Bz_new
B = prealloc.ϕ
φ = prealloc.φ
Bz = prealloc.Bz_old
B = prealloc.Bz_new
φ = prealloc.ϕ
α = prealloc.Rot.α
β = prealloc.Rot.β
ΔBz = prealloc.ΔBz
Maux_xy = prealloc.M.xy
Maux_z = prealloc.M.z

#Can be calculated outside of loop
@. ΔBz = p.Δw / T(2π * γ)

#Simulation
for s in seq #This iterates over seq, "s = seq[i,:]"
#Motion
Expand Down
153 changes: 153 additions & 0 deletions KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster
to calculate them on the CPU before the simulation is run."""
struct SeqBlockProperties{T<:Real}
length::Int64
nADC::Int64
first_ADC::Bool
ADC_indices::AbstractVector{Int64}
tp_ADC::AbstractVector{T}
dur::T
end

@functor SeqBlockProperties

"""Stores information added to the prealloc struct for use in run_spin_precession!
and run_spin_excitation!. This information is calculated on the CPU before the
simulation is run."""
struct BlochGPUPrecalc{T} <: PrecalcResult{T}
seq_properties::AbstractVector{SeqBlockProperties{T}}
end

@functor BlochGPUPrecalc

"""Precalculates sequence properties for use in run_spin_precession!"""
function precalculate(
sim_method::Bloch,
backend::KA.GPU,
seq::DiscreteSequence{T},
parts::Vector{UnitRange{S}},
excitation_bool::Vector{Bool}
) where {T<:Real,S<:Integer}
seq_properties = SeqBlockProperties{T}[]
for (block, p) in enumerate(parts)
seq_block = @view seq[p]
if excitation_bool[block]
push!(seq_properties, SeqBlockProperties(
length(seq_block.t),
count(seq_block.ADC),
false,
Int64[],
T[],
zero(T)
))
else
ADC_indices = findall(seq_block.ADC) .- 1
if seq_block.ADC[1]
deleteat!(ADC_indices, 1)
end
tp = cumsum(seq_block.Δt)
push!(seq_properties, SeqBlockProperties(
length(seq_block.t),
count(seq_block.ADC),
seq_block.ADC[1],
ADC_indices,
tp[ADC_indices],
last(tp)
))
end
end

return BlochGPUPrecalc(seq_properties)
end

"""Stores preallocated structs for use in Bloch GPU run_spin_precession function."""
struct BlochGPUPrealloc{T} <: PreallocResult{T}
seq_properties::AbstractVector{SeqBlockProperties{T}}
Bz::AbstractMatrix{T}
Δϕ::AbstractMatrix{T}
ϕ::AbstractArray{T}
Mxy::AbstractMatrix{Complex{T}}
ΔBz::AbstractVector{T}
end

Base.view(p::BlochGPUPrealloc{T}, i::UnitRange) where {T<:Real} = p
function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
seq_block = p.seq_properties[i]

return BlochGPUPrealloc(
[seq_block],
view(p.Bz,:,1:seq_block.length),
view(p.Δϕ,:,1:seq_block.length-1),
seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
view(p.Mxy,:,1:seq_block.nADC),
p.ΔBz
)
end

"""Preallocates arrays for use in run_spin_precession! and run_spin_excitation!."""
function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}, max_block_length::Integer, precalc) where {T<:Real}
if !(precalc isa BlochGPUPrecalc) @error """Sim method Bloch() does not support calling run_sim_time_iter directly. Use method BlochSimple() instead.""" end

return BlochGPUPrealloc(
precalc.seq_properties,
KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
obj.Δw ./ T(2π .* γ)
)
end

@inline function calculate_precession!(Δϕ::AbstractArray{T}, Δt::AbstractArray{T}, Bz::AbstractArray{T}) where {T<:Real}
Δϕ .= (Bz[:,2:end] .+ Bz[:,1:end-1]) .* Δt .* T(-π .* γ)
end
@inline function apply_precession!(ϕ::AbstractVector{T}, Δϕ::AbstractArray{T}) where {T<:Real}
ϕ .= sum(Δϕ, dims=2)
end
function apply_precession!(ϕ::AbstractArray{T}, Δϕ::AbstractArray{T}) where {T<:Real}
cumsum!(ϕ, Δϕ, dims=2)
end

function run_spin_precession!(
p::Phantom{T},
seq::DiscreteSequence{T},
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::Bloch,
backend::KA.GPU,
pre::BlochGPUPrealloc
) where {T<:Real}
#Simulation
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')

#Sequence block info
seq_block = pre.seq_properties[1]

#Effective field
pre.Bz .= x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ pre.ΔBz

#Rotation
calculate_precession!(pre.Δϕ, seq.Δt', pre.Bz)
# Reduces Δϕ Nspins x Nt to ϕ Nspins x Nt, if Nadc = 0, to Nspins x 1
apply_precession!(pre.ϕ, pre.Δϕ)

#Acquired signal
if seq_block.nADC > 0
ϕ_ADC = @view pre.ϕ[:,seq_block.ADC_indices]
if seq_block.first_ADC
pre.Mxy[:,1] .= M.xy
pre.Mxy[:,2:end] .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* _cis.(ϕ_ADC)
else
pre.Mxy .= M.xy .* exp.(-seq_block.tp_ADC' ./ p.T2) .* _cis.(ϕ_ADC)
end

sig .= transpose(sum(pre.Mxy; dims=1))
end

#Mxy precession and relaxation, and Mz relaxation
M.z .= M.z .* exp.(-seq_block.dur ./ p.T1) .+ p.ρ .* (T(1) .- exp.(-seq_block.dur ./ p.T1))
M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* _cis.(pre.ϕ[:,end])

return nothing
end
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ function run_spin_precession!(
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
#Rotation
if is_ADC_on(seq)
ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
ϕ = T(-2π .* γ) .* cumtrapz(seq.Δt', Bz)
else
ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ function run_spin_precession!(
Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
#Rotation
if is_ADC_on(seq)
ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
ϕ = T(-2π .* γ) .* cumtrapz(seq.Δt', Bz)
else
ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
end
Expand Down
Loading