From e08df27f5281466a2d04fadc6072afd3381c9283 Mon Sep 17 00:00:00 2001 From: Ludvig af Klinteberg Date: Wed, 16 Oct 2024 17:03:48 +0200 Subject: [PATCH 1/2] Add support for GPU type 3 transform * Tests are tentative pending fix of gpu_method=0 bug * Fix some typos along the way --- README.md | 4 +- src/cufinufft.jl | 27 ++++++++++- src/cufinufft_simple.jl | 101 ++++++++++++++++++++++++++++++++++++++++ src/types.jl | 5 +- test/test_cuda.jl | 45 ++++++++++++++++++ test/test_nufft.jl | 2 +- 6 files changed, 178 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index be59824..38a99cb 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This is a full-featured Julia interface to [FINUFFT](https://github.com/flatiron ## Installation -FINUFFT.jl requires Julia v1.6 or later, and has been tested up to v1.10. From the Pkg REPL mode (hit `]` in REPL to enter), run +FINUFFT.jl requires Julia v1.6 or later, and has been tested up to v1.11. From the Pkg REPL mode (hit `]` in REPL to enter), run ```julia add FINUFFT @@ -105,7 +105,7 @@ see [examples/time2d1.jl](examples/time2d1.jl) Finally, the more involved codes [test/test_nufft.jl](test/test_nufft.jl) and [test/test_cuda.jl](test/test_cuda.jl) -tests `dtype=Float64` and `dtype=Float32` precisions for all supported transform types, and can be used as references. +test `dtype=Float64` and `dtype=Float32` precisions for all supported transform types, and can be used as references. The outputs are tested there for mathematical correctness. In the 1D type 1 it also tests a vectorized simple, a guru call and a vectorized guru call. diff --git a/src/cufinufft.jl b/src/cufinufft.jl index dc9d92c..e46e766 100644 --- a/src/cufinufft.jl +++ b/src/cufinufft.jl @@ -122,7 +122,6 @@ function _cufinufft_makeplan(::Type{dtype}, n_modes = ones(Int64,3) if type==3 - throw("Type 3 not implemented yet") @assert ndims(n_modes_or_dim) == 0 dim = n_modes_or_dim else @@ -130,7 +129,7 @@ function _cufinufft_makeplan(::Type{dtype}, dim = length(n_modes_or_dim) n_modes[1:dim] .= n_modes_or_dim end - + if dtype==Float64 tol = Float64(eps) ret = ccall( (:cufinufft_makeplan, libcufinufft), @@ -414,6 +413,30 @@ function cufinufft_exec!(plan::cufinufft_plan{T}, plan.plan_ptr,output,input ) end + elseif type==3 + nk = plan.nk + if ntrans==1 + @assert size(output)==(nk,ntrans) || size(output)==(nk,) + else + @assert size(output)==(nk,ntrans) + end + if T==Float64 + ret = ccall( (:cufinufft_execute, libcufinufft), + Cint, + (cufinufft_plan_c, + CuRef{ComplexF64}, + CuRef{ComplexF64}), + plan.plan_ptr,input,output + ) + else + ret = ccall( (:cufinufftf_execute, libcufinufft), + Cint, + (cufinufft_plan_c, + CuRef{ComplexF32}, + CuRef{ComplexF32}), + plan.plan_ptr,input,output + ) + end else ret = ERR_TYPE_NOTVALID end diff --git a/src/cufinufft_simple.jl b/src/cufinufft_simple.jl index b80fc9b..fa32dd4 100644 --- a/src/cufinufft_simple.jl +++ b/src/cufinufft_simple.jl @@ -60,6 +60,35 @@ function nufft1d2!(xj :: CuArray{T}, cufinufft_destroy!(plan) end +""" + nufft1d3!(xj :: CuArray{Float64} or CuArray{Float32}, + cj :: CuArray{ComplexF64} or CuArray{ComplexF32}, + iflag :: Integer, + eps :: Real, + sk :: CuArray{Float64} or CuArray{Float32}, + fk :: CuArray{ComplexF64} or CuArray{ComplexF32}; + kwargs... + ) + +CUDA version. +""" +function nufft1d3!(xj :: CuArray{T}, + cj :: CuArray{Complex{T}}, + iflag :: Integer, + eps :: Real, + sk :: CuArray{T}, + fk :: CuArray{Complex{T}}; + kwargs...) where T <: finufftReal + (nj, nk) = valid_setpts(3,1,xj,T[],T[],sk) + ntrans = valid_ntr(xj,cj) + + checkkwdtype(T; kwargs...) + plan = _cufinufft_makeplan(T,3,1,iflag,ntrans,eps;kwargs...) + cufinufft_setpts!(plan,xj,CuVector{T}(),CuVector{T}(),sk) + cufinufft_exec!(plan,cj,fk) + cufinufft_destroy!(plan) +end + ## 2D """ @@ -123,6 +152,40 @@ function nufft2d2!(xj :: CuArray{T}, cufinufft_destroy!(plan) end +""" + nufft2d3!(xj :: CuArray{Float64} or CuArray{Float32}, + yj :: CuArray{Float64} or CuArray{Float32}, + cj :: CuArray{ComplexF64} or CuArray{ComplexF32}, + iflag :: Integer, + eps :: Real, + sk :: CuArray{Float64} or CuArray{Float32}, + tk :: CuArray{Float64} or CuArray{Float32}, + fk :: CuArray{ComplexF64} or CuArray{ComplexF32}; + kwargs... + ) + +CUDA version. +""" +function nufft2d3!(xj :: CuArray{T}, + yj :: CuArray{T}, + cj :: CuArray{Complex{T}}, + iflag :: Integer, + eps :: Real, + sk :: CuArray{T}, + tk :: CuArray{T}, + fk :: CuArray{Complex{T}}; + kwargs...) where T <: finufftReal + (nj, nk) = valid_setpts(3,2,xj,yj,T[],sk,tk) + ntrans = valid_ntr(xj,cj) + + checkkwdtype(T; kwargs...) + plan = _cufinufft_makeplan(T,3,2,iflag,ntrans,eps;kwargs...) + cufinufft_setpts!(plan,xj,yj,CuVector{T}(),sk,tk) + cufinufft_exec!(plan,cj,fk) + cufinufft_destroy!(plan) +end + + ## 3D """ @@ -188,3 +251,41 @@ function nufft3d2!(xj :: CuArray{T}, cufinufft_exec!(plan,fk,cj) cufinufft_destroy!(plan) end + +""" + nufft3d3!(xj :: CuArray{Float64} or CuArray{Float32}, + yj :: CuArray{Float64} or CuArray{Float32}, + zj :: CuArray{Float64} or CuArray{Float32}, + cj :: CuArray{ComplexF64} or CuArray{ComplexF32}, + iflag :: Integer, + eps :: Real, + sk :: CuArray{Float64} or CuArray{Float32}, + tk :: CuArray{Float64} or CuArray{Float32}, + uk :: CuArray{Float64} or CuArray{Float32}, + fk :: CuArray{ComplexF64} or CuArray{ComplexF32}; + kwargs... + ) + +CUDA version. +""" +function nufft3d3!(xj :: CuArray{T}, + yj :: CuArray{T}, + zj :: CuArray{T}, + cj :: CuArray{Complex{T}}, + iflag :: Integer, + eps :: Real, + sk :: CuArray{T}, + tk :: CuArray{T}, + uk :: CuArray{T}, + fk :: CuArray{Complex{T}}; + kwargs...) where T <: finufftReal + (nj, nk) = valid_setpts(3,3,xj,yj,zj,sk,tk,uk) + ntrans = valid_ntr(xj,cj) + + checkkwdtype(T; kwargs...) + plan = _cufinufft_makeplan(T,3,3,iflag,ntrans,eps;kwargs...) + cufinufft_setpts!(plan,xj,yj,zj,sk,tk,uk) + cufinufft_exec!(plan,cj,fk) + cufinufft_destroy!(plan) +end + diff --git a/src/types.jl b/src/types.jl index 46506fc..af493b7 100644 --- a/src/types.jl +++ b/src/types.jl @@ -130,6 +130,7 @@ const nufft_c_opts = nufft_opts # for backward compatibility - remove? gpu_stream :: Ptr{Cvoid} modeord :: Cint # (type 1,2 only): 0 CMCL-style increasing mode order # 1 FFT-style mode order + debug :: Cint # 0: no debug, 1: debug end Options struct passed to cuFINUFFT, see C documentation. @@ -162,6 +163,8 @@ mutable struct cufinufft_opts gpu_stream :: Ptr{Cvoid} modeord :: Cint # (type 1,2 only): 0 CMCL-style increasing mode order - # 1 FFT-style mode order + # 1 FFT-style mode order + + debug :: Cint # 0: no debug, 1: debug cufinufft_opts() = new() end diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 9a9cebc..369ad49 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -50,6 +50,9 @@ function test_cuda(tol::Real, dtype::DataType) y_d = CuArray(y) z_d = CuArray(z) c_d = CuArray(c) + s_d = CuArray(s) + t_d = CuArray(t) + u_d = CuArray(u) F1D_d = CuArray(F1D) F2D_d = CuArray(F2D) F3D_d = CuArray(F3D) @@ -128,6 +131,21 @@ function test_cuda(tol::Real, dtype::DataType) relerr_1d2_guru = norm(vec(out2)-vec(ref), Inf) / norm(vec(ref), Inf) @test relerr_1d2_guru < errfac*tol end + + # 1D3 + @testset "1D3" begin + out_d = CUDA.zeros(Complex{T},nk) + ref = zeros(Complex{T},nk) + for k=1:nk + for j=1:nj + ref[k] += c[j] * exp(1im*s[k]*x[j]) + end + end + nufft1d3!(x_d,c_d,1,tol,s_d,out_d, debug=1, gpu_method=1) # FIXME + relerr_1d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf) + @test relerr_1d3 < errfac*tol + end + end ## 2D @@ -177,6 +195,20 @@ function test_cuda(tol::Real, dtype::DataType) relerr_2d2_guru = norm(vec(out2)-vec(ref), Inf) / norm(vec(ref), Inf) @test relerr_2d2_guru < errfac*tol end + + @testset "2D3" begin + # 2D3 + out_d = CUDA.zeros(Complex{T},nk) + ref = zeros(Complex{T},nk) + for k=1:nk + for j=1:nj + ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j])) + end + end + nufft2d3!(x_d,y_d,c_d,1,tol,s_d,t_d,out_d, debug=1, gpu_method=1) # FIXME + relerr_2d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf) + @test relerr_2d3 < errfac*tol + end end ## 3D @@ -231,6 +263,19 @@ function test_cuda(tol::Real, dtype::DataType) @test relerr_3d2_guru < errfac*tol end + @testset "3D3" begin + # 3D3 + out_d = CUDA.zeros(Complex{T},nk) + ref = zeros(Complex{T},nk) + for k=1:nk + for j=1:nj + ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j]+u[k]*z[j])) + end + end + nufft3d3!(x_d,y_d,z_d,c_d,1,tol,s_d,t_d,u_d,out_d, debug=1, gpu_method=1) # FIXME + relerr_3d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf) + @test relerr_3d3 < errfac*tol + end end end end diff --git a/test/test_nufft.jl b/test/test_nufft.jl index 5aca25f..dfc1163 100644 --- a/test/test_nufft.jl +++ b/test/test_nufft.jl @@ -165,7 +165,7 @@ function test_nufft(tol::Real, dtype::DataType) @test reldiff < errdifffac*tol end - @testset "3D3" begin + @testset "2D3" begin # 2D3 out = zeros(Complex{T},nk) ref = zeros(Complex{T},nk) From af3b3eb65e24b8f7a9cfb1cdab36156f0713d188 Mon Sep 17 00:00:00 2001 From: Ludvig af Klinteberg Date: Fri, 18 Oct 2024 09:06:44 +0200 Subject: [PATCH 2/2] Remove FIXMEs --- test/test_cuda.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_cuda.jl b/test/test_cuda.jl index 369ad49..4c4dc70 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda.jl @@ -141,7 +141,7 @@ function test_cuda(tol::Real, dtype::DataType) ref[k] += c[j] * exp(1im*s[k]*x[j]) end end - nufft1d3!(x_d,c_d,1,tol,s_d,out_d, debug=1, gpu_method=1) # FIXME + nufft1d3!(x_d,c_d,1,tol,s_d,out_d) relerr_1d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf) @test relerr_1d3 < errfac*tol end @@ -205,7 +205,7 @@ function test_cuda(tol::Real, dtype::DataType) ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j])) end end - nufft2d3!(x_d,y_d,c_d,1,tol,s_d,t_d,out_d, debug=1, gpu_method=1) # FIXME + nufft2d3!(x_d,y_d,c_d,1,tol,s_d,t_d,out_d) relerr_2d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf) @test relerr_2d3 < errfac*tol end @@ -272,7 +272,7 @@ function test_cuda(tol::Real, dtype::DataType) ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j]+u[k]*z[j])) end end - nufft3d3!(x_d,y_d,z_d,c_d,1,tol,s_d,t_d,u_d,out_d, debug=1, gpu_method=1) # FIXME + nufft3d3!(x_d,y_d,z_d,c_d,1,tol,s_d,t_d,u_d,out_d) relerr_3d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf) @test relerr_3d3 < errfac*tol end