From 6803abf54669bb89a56e4c92b2e4951be3146f02 Mon Sep 17 00:00:00 2001
From: termi-official
Date: Fri, 14 Jul 2023 19:25:04 +0200
Subject: [PATCH 01/16] MatrixFree GPU heat equation prototype.

---
 docs/src/literate-tutorials/gpu_assembly.jl | 323 ++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 docs/src/literate-tutorials/gpu_assembly.jl

diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl
new file mode 100644
index 0000000000..c2adf4c57d
--- /dev/null
+++ b/docs/src/literate-tutorials/gpu_assembly.jl
@@ -0,0 +1,323 @@
+# # [Heat equation](@id tutorial-heat-equation)
+#
+# ![](heat_square.png)
+#
+# *Figure 1*: Temperature field on the unit square with an internal uniform heat source
+# solved with homogeneous Dirichlet boundary conditions on the boundary.
+#
+#-
+#md # !!! tip
+#md #     This example is also available as a Jupyter notebook:
+#md #     [`heat_equation.ipynb`](@__NBVIEWER_ROOT_URL__/examples/heat_equation.ipynb).
+#-
+#
+# ## Introduction
+#
+# The heat equation is the "Hello, world!" equation of finite elements.
+# Here we solve the equation on a unit square, with a uniform internal source.
+# The strong form of the (linear) heat equation is given by
+#
+# ```math
+# -\nabla \cdot (k \nabla u) = f \quad \textbf{x} \in \Omega,
+# ```
+#
+# where $u$ is the unknown temperature field, $k$ the heat conductivity,
+# $f$ the heat source and $\Omega$ the domain. For simplicity we set $f = 1$
+# and $k = 1$. We will consider homogeneous Dirichlet boundary conditions such that
+# ```math
+# u(\textbf{x}) = 0 \quad \textbf{x} \in \partial \Omega,
+# ```
+# where $\partial \Omega$ denotes the boundary of $\Omega$.
+# The resulting weak form is given as follows: Find ``u \in \mathbb{U}`` such that
+# ```math
+# \int_{\Omega} \nabla \delta u \cdot \nabla u \ d\Omega = \int_{\Omega} \delta u \ d\Omega \quad \forall \delta u \in \mathbb{T},
+# ```
+# where $\delta u$ is a test function, and where $\mathbb{U}$ and $\mathbb{T}$ are suitable
+# trial and test function sets, respectively.
+#-
+# ## Commented Program
+#
+# Now we solve the problem in Ferrite. What follows is a program spliced with comments.
+#md # The full program, without comments, can be found in the next [section](@ref heat_equation-plain-program).
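+#
+# In contrast to the CPU heat equation tutorial, this prototype never assembles a global
+# sparse matrix. The linear operator is instead applied "matrix-free": for every cell we
+# gather the local part of the input vector, apply the element operator, and scatter-add
+# the result into the output vector. A minimal serial sketch of this pattern is shown
+# below; `element_action!` is a placeholder for the element kernel developed in the rest
+# of the tutorial, the remaining calls are regular Ferrite API also used further down.
+#
+# ```julia
+# function matrix_free_action!(uout, uin, dh, element_action!)
+#     fill!(uout, 0)
+#     uₑ = zeros(ndofs_per_cell(dh, 1))
+#     for cell in CellIterator(dh)
+#         dofs = celldofs(cell)            # global dof indices of this cell
+#         fill!(uₑ, 0)
+#         element_action!(uₑ, uin[dofs])   # local action uₑ = Aₑ * uin[dofs]
+#         uout[dofs] .+= uₑ                # scatter-add into the global output
+#     end
+#     return uout
+# end
+# ```
+#
+# On the GPU many cells are processed concurrently, so the scatter-add can race between
+# cells that share degrees of freedom. This is why the grid is colored below: cells of the
+# same color never share dofs and can therefore be processed in parallel safely.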
+# +# First we load Ferrite, and some other packages we need +using Ferrite, CUDA +using IterativeSolvers, LinearAlgebra + +### TODO Extension +import Adapt +using StaticArrays +struct GPUQuadratureRule{N,T,dim} + weights::SVector{N,T} + points::SVector{N,Vec{dim,T}} +end +function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} + N = length(qr.weights) + GPUQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) +end +function Adapt.adapt_structure(to, nodes::Vector{Node}) + CuArray(get_node_coordinate.(nodes)) +end +### TODO Adapt dofhandler + +# TODO not sure how to do this automatically +function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) + ξ_x = ξ[1] + ξ_y = ξ[2] + i == 1 && return (1 - ξ_x) * (1 - ξ_y) * 0.25 + i == 2 && return (1 + ξ_x) * (1 - ξ_y) * 0.25 + i == 3 && return (1 + ξ_x) * (1 + ξ_y) * 0.25 + i == 4 && return (1 - ξ_x) * (1 + ξ_y) * 0.25 +end + +function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) + ξ_x = ξ[1] + ξ_y = ξ[2] + return @SVector [ + (1 - ξ_x) * (1 - ξ_y) * 0.25, + (1 + ξ_x) * (1 - ξ_y) * 0.25, + (1 + ξ_x) * (1 + ξ_y) * 0.25, + (1 - ξ_x) * (1 + ξ_y) * 0.25, + ] +end + +function shape_gradients(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) + ξ_x = ξ[1] + ξ_y = ξ[2] + return @SMatrix [ + (0 - 1) * (1 - ξ_y) * 0.25 (1 - ξ_x) * (0 - 1) * 0.25; + (0 + 1) * (1 - ξ_y) * 0.25 (1 + ξ_x) * (0 - 1) * 0.25; + (0 + 1) * (1 + ξ_y) * 0.25 (1 + ξ_x) * (0 + 1) * 0.25; + (0 - 1) * (1 + ξ_y) * 0.25 (1 - ξ_x) * (0 + 1) * 0.25; + ] +end + +function unsafe_shape_gradient(ip::Interpolation, ξ::Vec, i::Int) + return Tensors.gradient(x -> unsafe_shape_value(ip, x, i), ξ) +end + +cellnodes(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]]) + +# We start by generating a simple grid with 20x20 quadrilateral elements +# using `generate_grid`. The generator defaults to the unit square, +# so we don't need to specify the corners of the domain. +grid = generate_grid(Quadrilateral, (200, 200)); +colors = CuArray.(create_coloring(grid)); + +# ### Trial and test functions +# A `CellValues` facilitates the process of evaluating values and gradients of +# test and trial functions (among other things). To define +# this we need to specify an interpolation space for the shape functions. +# We use Lagrange functions +# based on the two-dimensional reference quadrilateral. We also define a quadrature rule based on +# the same reference element. We combine the interpolation and the quadrature rule +# to a `CellValues` object. +ip = Lagrange{RefQuadrilateral, 1}() +qr = QuadratureRule{RefQuadrilateral}(2) +cellvalues = CellValues(qr, ip); + +# ### Degrees of freedom +# Next we need to define a `DofHandler`, which will take care of numbering +# and distribution of degrees of freedom for our approximated fields. +# We create the `DofHandler` and then add a single scalar field called `:u` based on +# our interpolation `ip` defined above. +# Lastly we `close!` the `DofHandler`, it is now that the dofs are distributed +# for all the elements. +dh = DofHandler(grid) +add!(dh, :u, ip) +close!(dh); + +# ### Boundary conditions +# In Ferrite constraints like Dirichlet boundary conditions +# are handled by a `ConstraintHandler`. +# ch = ConstraintHandler(dh); + +# Next we need to add constraints to `ch`. For this problem we define +# homogeneous Dirichlet boundary conditions on the whole boundary, i.e. 
+# the `union` of all the face sets on the boundary. +# ∂Ω = union( +# getfaceset(grid, "left"), +# getfaceset(grid, "right"), +# getfaceset(grid, "top"), +# getfaceset(grid, "bottom"), +# ); + +# Now we are set up to define our constraint. We specify which field +# the condition is for, and our combined face set `∂Ω`. The last +# argument is a function of the form $f(\textbf{x})$ or $f(\textbf{x}, t)$, +# where $\textbf{x}$ is the spatial coordinate and +# $t$ the current time, and returns the prescribed value. Since the boundary condition in +# this case do not depend on time we define our function as $f(\textbf{x}) = 0$, i.e. +# no matter what $\textbf{x}$ we return $0$. When we have +# specified our constraint we `add!` it to `ch`. +# dbc = Dirichlet(:u, ∂Ω, (x, t) -> 0) +# add!(ch, dbc); +# close!(ch) + +function gpu_element_mass_action!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip, ip_geo, xₑ) where {V,N,T,dim} + n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, xₑ, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + for i in 1:n_basefuncs + ϕᵢ = unsafe_shape_value(ip, ξ, i) + for j in 1:n_basefuncs + ϕⱼ = unsafe_shape_value(ip, ξ, j) + uₑout[i] += ϕᵢ*ϕⱼ*uₑin[j]*dΩ + end + end + end + return nothing +end + +function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} + n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + shapes = shape_values(ip, ξ) + # TODO recover abstraction layer + J = getJ(ip_geo, xₑ, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + for i in 1:n_basefuncs + ϕᵢ = shapes[i] + for j in 1:n_basefuncs + inner = shapes[j]*uₑin[j] + uₑout[i] += inner*ϕᵢ*dΩ + end + end + end + return nothing +end + +# Mass action on a single color +Base.@propagate_inbounds function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} + index = threadIdx().x # this example only requires linear indexing, so just use `x` + stride = blockDim().x + for i = index:stride:length(cell_indices) + ## Get index of the current cell + cell_index = cell_indices[i] + ## Grab the actual cell + cell = cells[cell_index] + ## Grab the dofs on the cell + cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) + cell_dofs = @view all_cell_dofs[cell_dof_range] + ## Grab the buffers for the y and x + uₑin = @view uin[cell_dofs] + uₑout = @view uout[cell_dofs] + ## Grab coordinate array + xₑ = cellnodes(cell, nodes) + ## Apply local action for y = Ax + gpu_element_mass_action2!(uₑout, uₑin, qr, ip, ip_geo, xₑ) + end + return nothing +end + +# Mass action of the full operator +function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} + # Initialize solution + fill!(uout, 0.0) + synchronize() + + # Apply action one time + dofs_per_cell = ndofs_per_cell(dh, 1) + for color ∈ colors + numthreads = 256 + numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) + # try + @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + synchronize() + # catch err + # code_typed(err; interactive = true) + # end + end +end + +function getJ(ip_geo::GIP, xₑ::NTuple{N,Vec{dim,T}}, ξ::Vec{dim,T}) where {N,GIP,dim,T} + dMdξ = shape_gradients(ip_geo, ξ) + fecv_J = zero(Tensor{2,dim,T}) + for i in 1:length(xₑ) + fecv_J += xₑ[i] ⊗ Vec{dim,T}(dMdξ[i, :]) + end + return fecv_J +end + +# +# Define the operator struct (https://iterativesolvers.julialinearalgebra.org/dev/getting_started/) +struct FerriteGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} + # "GPUGrid" + gpu_cells::CACELLS + gpu_nodes::CANODES + colors::CACOLORS + # "GPUDofHandler" + all_cell_dofs::CADOFS + cell_dof_offsets::CAOFFSETS + # "GPUValues" + ip::IP + ip_geo::GIP + qr::QR +end + +function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) + FerriteGPUMassOperator( + CuArray(getcells(Ferrite.get_grid(dh))), + CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + colors, + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + ip, ip_geo, qr + ) +end + +LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) +Base.eltype(A::FerriteGPUMassOperator) = Float64 +Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator + +A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) + +function generate_rhs(dh) + rhs = zeros(ndofs(dh)) + for cell in CellIterator(dh) + reinit!(cellvalues, cell) + coords = get_cell_coordinates(cell) + n_basefuncs = getnbasefunctions(cellvalues) + fe = zeros(n_basefuncs) + for q_point in 1:getnquadpoints(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + x = spatial_coordinate(cellvalues, q_point, coords) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ## Add contribution to fe + fe[i] += cos(x[1]/π)*cos(x[2]/π)*δu * dΩ + end + end + rhs[celldofs(cell)] .+= fe + end + return CuArray(rhs) +end +b = generate_rhs(dh) +u = CUDA.fill(0.0, ndofs(dh)); +cg!(u, A, b; verbose=true); + +# ### Exporting to VTK +# To visualize the result we export the grid and our field `u` +# to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). +vtk_grid("heat_equation", dh) do vtk + vtk_point_data(vtk, dh, Array(u)) +end + +## test the result #src +# using Test #src +# @test norm(u) ≈ 3.307743912641305 #src + +#md # ## [Plain program](@id heat_equation-plain-program) +#md # +#md # Here follows a version of the program without any comments. +#md # The file is also available here: [`heat_equation.jl`](heat_equation.jl). +#md # +#md # ```julia +#md # @__CODE__ +#md # ``` From 08256af8bbd8ff99b272262b9c5f808eabe23874 Mon Sep 17 00:00:00 2001 From: termi-official Date: Wed, 11 Oct 2023 18:41:27 +0200 Subject: [PATCH 02/16] Temporary solution for sparsity pattern on GPU. 
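
The new `matrix_type` keyword (default `SparseMatrixCSC{Float64}`) makes the value type of
the created pattern configurable, e.g. for single-precision or complex operators. A usage
sketch:

    K = create_sparsity_pattern(dh; matrix_type = SparseMatrixCSC{Float32})
    eltype(K) == Float32   # true
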
--- src/Dofs/sparsity_pattern.jl | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/Dofs/sparsity_pattern.jl b/src/Dofs/sparsity_pattern.jl index 46ef67c3d3..8171ea3a54 100644 --- a/src/Dofs/sparsity_pattern.jl +++ b/src/Dofs/sparsity_pattern.jl @@ -1,5 +1,5 @@ """ - create_sparsity_pattern(dh::DofHandler; coupling, [topology::Union{Nothing, AbstractTopology}], [cross_coupling]) + create_sparsity_pattern(dh::DofHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling, matrix_type) Create the sparsity pattern corresponding to the degree of freedom numbering in the [`DofHandler`](@ref). Return a `SparseMatrixCSC` @@ -13,15 +13,17 @@ not. By default full coupling is assumed inside the element with no coupling bet If `topology` and `cross_coupling` are passed, dof of fields with discontinuous interpolations are coupled between elements according to `cross_coupling`. +With + See the [Sparsity Pattern](@ref) section of the manual. """ function create_sparsity_pattern(dh::AbstractDofHandler; coupling=nothing, - topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing) - return _create_sparsity_pattern(dh, nothing, false, true, coupling, topology, cross_coupling) + topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return _create_sparsity_pattern(matrix_type, dh, nothing, false, true, coupling, topology, cross_coupling) end """ - create_symmetric_sparsity_pattern(dh::DofHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling) + create_symmetric_sparsity_pattern(dh::DofHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling, matrix_type) Create the symmetric sparsity pattern corresponding to the degree of freedom numbering in the [`DofHandler`](@ref) by only considering the upper @@ -30,32 +32,32 @@ triangle of the matrix. Return a `Symmetric{SparseMatrixCSC}`. See the [Sparsity Pattern](@ref) section of the manual. """ function create_symmetric_sparsity_pattern(dh::AbstractDofHandler; coupling=nothing, - topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing) - return Symmetric(_create_sparsity_pattern(dh, nothing, true, true, coupling, topology, cross_coupling), :U) + topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return Symmetric(_create_sparsity_pattern(matrix_type, dh, nothing, true, true, coupling, topology, cross_coupling), :U) end """ - create_symmetric_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling) + create_symmetric_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling, matrix_type) Create a symmetric sparsity pattern accounting for affine constraints in `ch`. See the Affine Constraints section of the manual for further details. 
""" function create_symmetric_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; keep_constrained::Bool=true, coupling=nothing, topology::Union{Nothing, AbstractTopology} = nothing, - cross_coupling = nothing) - return Symmetric(_create_sparsity_pattern(dh, ch, true, keep_constrained, coupling, topology, cross_coupling), :U) + cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return Symmetric(_create_sparsity_pattern(matrix_type, dh, ch, true, keep_constrained, coupling, topology, cross_coupling), :U) end """ - create_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology} = nothing) + create_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology}, matrix_type) Create a sparsity pattern accounting for affine constraints in `ch`. See the Affine Constraints section of the manual for further details. """ function create_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; keep_constrained::Bool=true, coupling=nothing, topology::Union{Nothing, AbstractTopology} = nothing, - cross_coupling = nothing) - return _create_sparsity_pattern(dh, ch, false, keep_constrained, coupling, topology, cross_coupling) + cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return _create_sparsity_pattern(matrix_type, dh, ch, false, keep_constrained, coupling, topology, cross_coupling) end # Compute a coupling matrix of size (ndofs_per_cell × ndofs_per_cell) based on the input @@ -163,8 +165,8 @@ function cross_element_coupling!(dh::DofHandler, ch::Union{ConstraintHandler, No return cnt end -function _create_sparsity_pattern(dh::AbstractDofHandler, ch#=::Union{ConstraintHandler, Nothing}=#, sym::Bool, keep_constrained::Bool, coupling::Union{AbstractMatrix{Bool},Nothing}, - topology::Union{Nothing, AbstractTopology}, cross_coupling::Union{AbstractMatrix{Bool},Nothing}) +function _create_sparsity_pattern(matrix_type::Type{SparseMatrixCSC{T}}, dh::AbstractDofHandler, ch#=::Union{ConstraintHandler, Nothing}=#, sym::Bool, keep_constrained::Bool, coupling::Union{AbstractMatrix{Bool},Nothing}, + topology::Union{Nothing, AbstractTopology}, cross_coupling::Union{AbstractMatrix{Bool},Nothing}) where {T} @assert isclosed(dh) if !keep_constrained @assert ch !== nothing && isclosed(ch) @@ -227,7 +229,7 @@ function _create_sparsity_pattern(dh::AbstractDofHandler, ch#=::Union{Constraint end @assert length(I) == length(J) == cnt - K = spzeros!!(Float64, I, J, ndofs(dh), ndofs(dh)) + K = spzeros!!(T, I, J, ndofs(dh), ndofs(dh)) # If ConstraintHandler is given, create the condensation pattern due to affine constraints if ch !== nothing @@ -295,7 +297,7 @@ function _condense_sparsity_pattern!(K::SparseMatrixCSC{T}, dofcoefficients::Vec resize!(J, cnt) # Fill the sparse matrix with a non-zero value so that :+ operation does not remove entries with value zero. - K2 = spzeros!!(Float64, I, J, ndofs, ndofs) + K2 = spzeros!!(T, I, J, ndofs, ndofs) fill!(K2.nzval, 1) K .+= K2 From d8456a4ee41c832a6b4e53f96d901b22bd00ba1c Mon Sep 17 00:00:00 2001 From: termi-official Date: Wed, 11 Oct 2023 19:11:17 +0200 Subject: [PATCH 03/16] Add missing tests. 
---
 test/test_dofs.jl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/test_dofs.jl b/test/test_dofs.jl
index 676fa1c6d6..dda5427718 100644
--- a/test/test_dofs.jl
+++ b/test/test_dofs.jl
@@ -436,6 +436,20 @@ end
         @test is_stored(K, i, j)
     end
 
+    # Full coupling (default) with reduced precision
+    K = create_sparsity_pattern(dh; matrix_type=SparseMatrixCSC{Float32})
+    @test eltype(K) == Float32
+    for j in 1:ndofs(dh), i in 1:ndofs(dh)
+        @test is_stored(K, i, j)
+    end
+
+    # Complex operator
+    K = create_sparsity_pattern(dh; matrix_type=SparseMatrixCSC{ComplexF64})
+    @test eltype(K) == ComplexF64
+    for j in 1:ndofs(dh), i in 1:ndofs(dh)
+        @test is_stored(K, i, j)
+    end
+
     # Field coupling
     coupling = [
         # u p

From 7e151bfd9217493b010a48821bd4609e2382b415 Mon Sep 17 00:00:00 2001
From: termi-official
Date: Tue, 17 Oct 2023 15:46:00 +0200
Subject: [PATCH 04/16] =?UTF-8?q?Make=20GPU=20catching=20=F0=9F=94=A5.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/src/literate-tutorials/gpu_assembly.jl | 204 ++++++++++++++++----
 1 file changed, 166 insertions(+), 38 deletions(-)

diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl
index c2adf4c57d..9a6163e8f8 100644
--- a/docs/src/literate-tutorials/gpu_assembly.jl
+++ b/docs/src/literate-tutorials/gpu_assembly.jl
@@ -58,26 +58,56 @@ end
 function Adapt.adapt_structure(to, nodes::Vector{Node})
     CuArray(get_node_coordinate.(nodes))
 end
-### TODO Adapt dofhandler
+### TODO Adapt dofhandler?
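+#
+# One conceivable answer to the TODO above, roughly the shape the `GPUDofHandler` wrapper
+# introduced later in this series takes, is to mirror the dof handler's flat arrays in a
+# small struct and let Adapt recurse into its fields. A sketch only, not yet used by the
+# code below:
+#
+# ```julia
+# struct GPUDofHandler{CD<:AbstractVector{Int},CO<:AbstractVector{Int},G}
+#     cell_dofs::CD
+#     cell_dofs_offset::CO
+#     grid::G
+# end
+# function Adapt.adapt_structure(to, dh::GPUDofHandler)
+#     GPUDofHandler(Adapt.adapt(to, dh.cell_dofs),
+#                   Adapt.adapt(to, dh.cell_dofs_offset),
+#                   Adapt.adapt(to, dh.grid))
+# end
+# ```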
# TODO not sure how to do this automatically function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) ξ_x = ξ[1] ξ_y = ξ[2] - i == 1 && return (1 - ξ_x) * (1 - ξ_y) * 0.25 - i == 2 && return (1 + ξ_x) * (1 - ξ_y) * 0.25 - i == 3 && return (1 + ξ_x) * (1 + ξ_y) * 0.25 - i == 4 && return (1 - ξ_x) * (1 + ξ_y) * 0.25 + i == 1 && return (1 - ξ_x) * (1 - ξ_y) / 4 + i == 2 && return (1 + ξ_x) * (1 - ξ_y) / 4 + i == 3 && return (1 + ξ_x) * (1 + ξ_y) / 4 + i == 4 && return (1 - ξ_x) * (1 + ξ_y) / 4 +end + +function unsafe_shape_value(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}, i::Int) + ξ_x = ξ[1] + ξ_y = ξ[2] + ξ_z = ξ[3] + i == 1 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) + i == 2 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) + i == 3 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) + i == 4 && return 0.125(1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) + i == 5 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) + i == 6 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) + i == 7 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) + i == 8 && return 0.125(1 - ξ_x) * (1 + ξ_y) * (1 + ξ_z) end function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] return @SVector [ - (1 - ξ_x) * (1 - ξ_y) * 0.25, - (1 + ξ_x) * (1 - ξ_y) * 0.25, - (1 + ξ_x) * (1 + ξ_y) * 0.25, - (1 - ξ_x) * (1 + ξ_y) * 0.25, + (1 - ξ_x) * (1 - ξ_y) / 4, + (1 + ξ_x) * (1 - ξ_y) / 4, + (1 + ξ_x) * (1 + ξ_y) / 4, + (1 - ξ_x) * (1 + ξ_y) / 4, + ] +end + +function shape_values(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}) + ξ_x = ξ[1] + ξ_y = ξ[2] + ξ_z = ξ[3] + return @SVector [ + (1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, + (1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, + (1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, + (1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, + (1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, + (1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, + (1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, + (1 - ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, ] end @@ -85,10 +115,26 @@ function shape_gradients(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] return @SMatrix [ - (0 - 1) * (1 - ξ_y) * 0.25 (1 - ξ_x) * (0 - 1) * 0.25; - (0 + 1) * (1 - ξ_y) * 0.25 (1 + ξ_x) * (0 - 1) * 0.25; - (0 + 1) * (1 + ξ_y) * 0.25 (1 + ξ_x) * (0 + 1) * 0.25; - (0 - 1) * (1 + ξ_y) * 0.25 (1 - ξ_x) * (0 + 1) * 0.25; + (0 - 1) * (1 - ξ_y) / 4 (1 - ξ_x) * (0 - 1) / 4; + (0 + 1) * (1 - ξ_y) / 4 (1 + ξ_x) * (0 - 1) / 4; + (0 + 1) * (1 + ξ_y) / 4 (1 + ξ_x) * (0 + 1) / 4; + (0 - 1) * (1 + ξ_y) / 4 (1 - ξ_x) * (0 + 1) / 4; + ] +end + +function shape_gradients(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}) + ξ_x = ξ[1] + ξ_y = ξ[2] + ξ_z = ξ[3] + return @SMatrix [ + (0 - 1) * (1 - ξ_y) * (1 - ξ_z) / 8 (1 - ξ_x) * (0 - 1) * (1 - ξ_z) / 8 (1 - ξ_x) * (1 - ξ_y) * (0 - 1) / 8; + (0 + 1) * (1 - ξ_y) * (1 - ξ_z) / 8 (1 + ξ_x) * (0 - 1) * (1 - ξ_z) / 8 (1 + ξ_x) * (1 - ξ_y) * (0 - 1) / 8; + (0 + 1) * (1 + ξ_y) * (1 - ξ_z) / 8 (1 + ξ_x) * (0 + 1) * (1 - ξ_z) / 8 (1 + ξ_x) * (1 + ξ_y) * (0 - 1) / 8; + (0 - 1) * (1 + ξ_y) * (1 - ξ_z) / 8 (1 - ξ_x) * (0 + 1) * (1 - ξ_z) / 8 (1 - ξ_x) * (1 + ξ_y) * (0 - 1) / 8; + (0 - 1) * (1 - ξ_y) * (1 + ξ_z) / 8 (1 - ξ_x) * (0 - 1) * (1 + ξ_z) / 8 (1 - ξ_x) * (1 - ξ_y) * (0 + 1) / 8; + (0 + 1) * (1 - ξ_y) * (1 + ξ_z) / 8 (1 + ξ_x) * (0 - 1) * (1 + ξ_z) / 8 (1 + ξ_x) * (1 - ξ_y) * (0 + 1) / 8; + (0 + 1) * (1 + ξ_y) * (1 + ξ_z) / 8 (1 + ξ_x) * (0 + 1) * (1 + ξ_z) / 8 (1 + ξ_x) * (1 + ξ_y) * (0 + 1) / 8; + (0 - 1) * (1 + ξ_y) * (1 + ξ_z) / 8 (1 - ξ_x) * (0 + 1) * (1 + ξ_z) / 8 (1 - ξ_x) * (1 + ξ_y) * (0 + 1) / 8; ] end @@ -96,13 +142,15 @@ 
function unsafe_shape_gradient(ip::Interpolation, ξ::Vec, i::Int) return Tensors.gradient(x -> unsafe_shape_value(ip, x, i), ξ) end +# ntuple fails... cellnodes(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]]) +cellnodes(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]], nodes[cell.nodes[5]], nodes[cell.nodes[6]], nodes[cell.nodes[7]], nodes[cell.nodes[8]]) # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Quadrilateral, (200, 200)); -colors = CuArray.(create_coloring(grid)); +grid = generate_grid(Hexahedron, (100, 100, 100)); +colors = CuArray.(create_coloring(grid)); # TODO add example without coloring, i.e. using Atomics instead # ### Trial and test functions # A `CellValues` facilitates the process of evaluating values and gradients of @@ -112,8 +160,8 @@ colors = CuArray.(create_coloring(grid)); # based on the two-dimensional reference quadrilateral. We also define a quadrature rule based on # the same reference element. We combine the interpolation and the quadrature rule # to a `CellValues` object. -ip = Lagrange{RefQuadrilateral, 1}() -qr = QuadratureRule{RefQuadrilateral}(2) +ip = Lagrange{RefHexahedron, 1}() +qr = QuadratureRule{RefHexahedron}(2) cellvalues = CellValues(qr, ip); # ### Degrees of freedom @@ -276,36 +324,116 @@ Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) -function generate_rhs(dh) - rhs = zeros(ndofs(dh)) - for cell in CellIterator(dh) - reinit!(cellvalues, cell) - coords = get_cell_coordinates(cell) - n_basefuncs = getnbasefunctions(cellvalues) - fe = zeros(n_basefuncs) - for q_point in 1:getnquadpoints(cellvalues) - ## Get the quadrature weight - dΩ = getdetJdV(cellvalues, q_point) - x = spatial_coordinate(cellvalues, q_point, coords) - ## Loop over test shape functions - for i in 1:n_basefuncs - δu = shape_value(cellvalues, q_point, i) - ## Add contribution to fe - fe[i] += cos(x[1]/π)*cos(x[2]/π)*δu * dΩ - end +struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} + # "GPUGrid" + gpu_cells::CACELLS + gpu_nodes::CANODES + colors::CACOLORS + # "GPUDofHandler" + all_cell_dofs::CADOFS + cell_dof_offsets::CAOFFSETS + # "GPUValues" + ip::IP + ip_geo::GIP + qr::QR +end + +function FerriteGPURHS(dh, colors, ip, ip_geo, qr) + FerriteGPURHS( + CuArray(getcells(Ferrite.get_grid(dh))), + CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + colors, + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + ip, ip_geo, qr + ) +end + +function generate_rhs(gpurhs::FerriteGPURHS) + rhs = CuArray(zeros(ndofs(dh))) + numthreads = 256 + numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) + dofs_per_cell = ndofs_per_cell(dh, 1) + for color ∈ gpurhs.colors + # try + @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.all_cell_dofs, gpurhs.cell_dof_offsets, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color, gpurhs.gpu_cells, gpurhs.gpu_nodes, dofs_per_cell) + synchronize() + # catch err + # code_typed(err; interactive = true) + # end + end + + # for cell in CellIterator(dh) + # reinit!(cellvalues, cell) + # coords = getcoordinates(cell) + # n_basefuncs = getnbasefunctions(cellvalues) + # fe = zeros(n_basefuncs) + # for q_point in 1:getnquadpoints(cellvalues) + # ## Get the quadrature weight + # dΩ = getdetJdV(cellvalues, q_point) + # x = spatial_coordinate(cellvalues, q_point, coords) + # ## Loop over test shape functions + # for i in 1:n_basefuncs + # δu = shape_value(cellvalues, q_point, i) + # ## Add contribution to fe + # fe[i] += cos(x[1]/π)*cos(x[2]/π)*δu * dΩ + # end + # end + # rhs[celldofs(cell)] .+= fe + # end + return rhs +end + +Base.@propagate_inbounds function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} + index = threadIdx().x # this example only requires linear indexing, so just use `x` + stride = blockDim().x + for i = index:stride:length(cell_indices) + ## Get index of the current cell + cell_index = cell_indices[i] + ## Grab the actual cell + cell = cells[cell_index] + ## Grab the dofs on the cell + cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) + cell_dofs = @view all_cell_dofs[cell_dof_range] + ## Grab the buffers for the y and x + rhsₑ = @view rhs[cell_dofs] + ## Grab coordinate array + coords = cellnodes(cell, nodes) + ## Apply local action for y = Ax + gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) + end +end + +function gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) + n_basefuncs = getnbasefunctions(ip) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, coords, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + + # TODO spatial_coordinate + x = zero(Vec{3,Float64}) + for i in 1:n_basefuncs + x += unsafe_shape_value(ip_geo, ξ, i)*coords[i] #geometric_value(fe_v, q_point, i) * x[i] + end + + for i in 1:n_basefuncs + ϕᵢ = unsafe_shape_value(ip, ξ, i) + rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᵢ * dΩ end - rhs[celldofs(cell)] .+= fe end - return CuArray(rhs) end -b = generate_rhs(dh) + + +b = generate_rhs(FerriteGPURHS(dh, colors, ip, ip, qr)) u = CUDA.fill(0.0, ndofs(dh)); cg!(u, A, b; verbose=true); # ### Exporting to VTK # To visualize the result we export the grid and our field `u` # to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). -vtk_grid("heat_equation", dh) do vtk +vtk_grid("heat_equation", dh) do vtk # The cake is a lie vtk_point_data(vtk, dh, Array(u)) end From 6f33d098e2ae72e791dbbddbdf6d0fad0bb44882 Mon Sep 17 00:00:00 2001 From: termi-official Date: Tue, 17 Oct 2023 15:46:23 +0200 Subject: [PATCH 05/16] Packages. 
--- docs/Manifest.toml | 185 ++++++++++++++++++++++++++++++++++++++++++++- docs/Project.toml | 3 + 2 files changed, 184 insertions(+), 4 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index e79fd3ef2e..9749a9bbd8 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,8 +1,8 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.9.1" +julia_version = "1.9.3" manifest_format = "2.0" -project_hash = "a0caf5d07899f1863658c7736f8530d20d28db07" +project_hash = "3221800eb396241cbc64f6deb94d7c003e4f881d" [[deps.ADTypes]] git-tree-sha1 = "5d2e21d7b0d8c22f67483ef95ebdc39c0e6b6003" @@ -14,6 +14,17 @@ git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" version = "0.0.1" +[[deps.AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.5.0" +weakdeps = ["ChainRulesCore", "Test"] + + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + [[deps.AbstractTrees]] git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -80,6 +91,18 @@ weakdeps = ["SparseArrays"] [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -123,12 +146,45 @@ git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.2" + [[deps.CPUSummary]] deps = ["CpuId", "IfElse", "PrecompileTools", "Static"] git-tree-sha1 = "601f7e7b3d36f18790e2caf83a882d88e9b71ff1" uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" version = "0.2.4" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "f062a48c26ae027f70c44f48f244862aec47bf99" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "5.0.0" +weakdeps = ["SpecialFunctions"] + + [deps.CUDA.extensions] + SpecialFunctionsExt = "SpecialFunctions" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "2f64185414751a5f878c4ab616c0edd94ade3419" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.6.0+4" + +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.2.2" + +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "178a606891f82b6d734f0eadd5336b7aad44d5df" +uuid = 
"76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.9.2+1" + [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2" @@ -205,7 +261,7 @@ weakdeps = ["Dates", "LinearAlgebra"] [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.2+0" +version = "1.0.5+0" [[deps.ConcreteStructs]] git-tree-sha1 = "f749037478283d372048690eb3b5f92a79432b34" @@ -243,11 +299,22 @@ git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406" uuid = "adafc99b-e345-5852-983c-f28acb93d879" version = "0.3.1" +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + [[deps.DataAPI]] git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" version = "1.15.0" +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d" @@ -558,12 +625,24 @@ deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" version = "6.2.1+2" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "9.0.0" + [[deps.GPUArraysCore]] deps = ["Adapt"] git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.5" +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.24.5" + [[deps.GR]] deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Pkg", "Preferences", "Printf", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "UUIDs", "p7zip_jll"] git-tree-sha1 = "27442171f28c952804dede8ff72828a96f2bfc1f" @@ -657,6 +736,12 @@ git-tree-sha1 = "5cd07aab533df5170988219191dfad0519391428" uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" version = "0.1.3" +[[deps.InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" + [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "ad37c091f7d7daf900963171600d7c1c5c3ede32" @@ -667,6 +752,11 @@ version = "2023.2.0+0" deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" 
+version = "1.3.0" + [[deps.IrrationalConstants]] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" @@ -719,12 +809,28 @@ git-tree-sha1 = "6f2675ef130a300a112286de91973805fcc5ffbc" uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" version = "2.1.91+0" +[[deps.JuliaNVTXCallbacks_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" + [[deps.KLU]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse_jll"] git-tree-sha1 = "884c2968c2e8e7e6bf5956af88cb46aa745c854b" uuid = "ef3ab10e-7fda-4108-b977-705223b18434" version = "0.4.1" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "5f1ecfddb6abde48563d08b2cc7e5116ebcd6c27" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.10" +weakdeps = ["EnzymeCore"] + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + [[deps.Krylov]] deps = ["LinearAlgebra", "Printf", "SparseArrays"] git-tree-sha1 = "17e462054b42dcdda73e9a9ba0c67754170c88ae" @@ -743,6 +849,18 @@ git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434" uuid = "88015f11-f218-50d7-93a8-a6af411a945d" version = "3.0.0+1" +[[deps.LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "4ea2928a96acfcf8589e6cd1429eff2a3a82c366" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "6.3.0" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "e7c01b69bcbcb93fd4cbc3d0fea7d229541e18d2" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.26+0" + [[deps.LLVMOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "f689897ccbe049adb19a065c495e75f372ecd42b" @@ -1053,6 +1171,18 @@ git-tree-sha1 = "019f12e9a1a7880459d0173c182e6a99365d7ac1" uuid = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" version = "4.5.1" +[[deps.NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.3" + +[[deps.NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" + [[deps.NaNMath]] deps = ["OpenLibm_jll"] git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4" @@ -1181,7 +1311,7 @@ version = "0.42.2+0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.0" +version = "1.9.2" [[deps.PlotThemes]] deps = ["PlotUtils", "Statistics"] @@ -1227,6 +1357,12 @@ git-tree-sha1 = "240d7170f5ffdb285f9427b92333c3463bf65bf6" uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad" version = "0.2.1" +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + [[deps.PositiveFactorizations]] deps = ["LinearAlgebra"] git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" @@ -1257,6 +1393,12 @@ git-tree-sha1 = 
"00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.4.1" +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.2.7" + [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -1281,6 +1423,18 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.6.1" + +[[deps.RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + [[deps.RecipesBase]] deps = ["PrecompileTools"] git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" @@ -1409,6 +1563,12 @@ git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.0" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.0" + [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -1568,6 +1728,12 @@ git-tree-sha1 = "b765e46ba27ecf6b44faf70df40c57aa3a547dcb" uuid = "69024149-9ee7-55f6-a4c4-859efe599b68" version = "0.3.7" +[[deps.StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.4" + [[deps.StructTypes]] deps = ["Dates", "UUIDs"] git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70" @@ -1705,6 +1871,17 @@ git-tree-sha1 = "e2d817cc500e960fdbafcf988ac8436ba3208bfd" uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728" version = "1.6.3" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + [[deps.Unzip]] git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78" uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" diff --git a/docs/Project.toml b/docs/Project.toml index 2d83ab8a20..752415eee6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,7 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" Ferrite = "c061ca5d-56c9-439f-9c0e-210fe06d3992" @@ -16,6 +18,7 @@ Optim = "429524aa-4258-5aef-a3af-852621145aeb" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" From 1e5cb41bbb9cbe01f5e1fad6cc4479e9461f9f74 Mon Sep 17 00:00:00 2001 From: termi-official Date: Wed, 13 Dec 2023 14:13:21 +0100 Subject: [PATCH 06/16] 
... --- docs/src/literate-tutorials/gpu_assembly.jl | 64 +++++++++------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 9a6163e8f8..bbc751cc82 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -53,10 +53,10 @@ struct GPUQuadratureRule{N,T,dim} end function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} N = length(qr.weights) - GPUQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) + GPUQuadratureRule{N,Float32,dim}(SVector{N,Float32}(qr.weights), SVector{N,Vec{dim,Float32}}(qr.points)) end -function Adapt.adapt_structure(to, nodes::Vector{Node}) - CuArray(get_node_coordinate.(nodes)) +function Adapt.adapt_structure(to, nodes::Vector{Node{dim}}) where dim + CuArray(Vec{dim,Float32}.(get_node_coordinate.(grid.nodes))) end ### TODO Adapt dofhandler? @@ -149,7 +149,7 @@ cellnodes(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]] # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Hexahedron, (100, 100, 100)); +grid = generate_grid(Hexahedron, (20, 20, 20)); colors = CuArray.(create_coloring(grid)); # TODO add example without coloring, i.e. using Atomics instead # ### Trial and test functions @@ -202,27 +202,10 @@ close!(dh); # add!(ch, dbc); # close!(ch) -function gpu_element_mass_action!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip, ip_geo, xₑ) where {V,N,T,dim} - n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) - ξ = qr.points[q_point] - # TODO recover abstraction layer - J = getJ(ip_geo, xₑ, ξ) - dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) - for i in 1:n_basefuncs - ϕᵢ = unsafe_shape_value(ip, ξ, i) - for j in 1:n_basefuncs - ϕⱼ = unsafe_shape_value(ip, ξ, j) - uₑout[i] += ϕᵢ*ϕⱼ*uₑin[j]*dΩ - end - end - end - return nothing -end - function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + # shapes = MVector{8,Float32}(undef) + @inbounds for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) ξ = qr.points[q_point] shapes = shape_values(ip, ξ) # TODO recover abstraction layer @@ -240,10 +223,10 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{ end # Mass action on a single color -Base.@propagate_inbounds function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} +function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x - for i = index:stride:length(cell_indices) + @inbounds for i = index:stride:length(cell_indices) ## Get index of the current 
cell cell_index = cell_indices[i] ## Grab the actual cell @@ -265,7 +248,7 @@ end # Mass action of the full operator function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} # Initialize solution - fill!(uout, 0.0) + fill!(uout, 0.f0) synchronize() # Apply action one time @@ -274,8 +257,9 @@ function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR numthreads = 256 numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) # try - @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) synchronize() + break # catch err # code_typed(err; interactive = true) # end @@ -308,9 +292,10 @@ struct FerriteGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, end function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) + grid = Ferrite.get_grid(dh) FerriteGPUMassOperator( - CuArray(getcells(Ferrite.get_grid(dh))), - CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + CuArray(getcells(grid)), + CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), colors, CuArray(dh.cell_dofs), CuArray(dh.cell_dofs_offset), @@ -339,9 +324,10 @@ struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} end function FerriteGPURHS(dh, colors, ip, ip_geo, qr) + grid = Ferrite.get_grid(dh) FerriteGPURHS( - CuArray(getcells(Ferrite.get_grid(dh))), - CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + CuArray(getcells(grid)), + CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), colors, CuArray(dh.cell_dofs), CuArray(dh.cell_dofs_offset), @@ -350,7 +336,7 @@ function FerriteGPURHS(dh, colors, ip, ip_geo, qr) end function generate_rhs(gpurhs::FerriteGPURHS) - rhs = CuArray(zeros(ndofs(dh))) + rhs = CuArray(zeros(Float32,ndofs(dh))) numthreads = 256 numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) dofs_per_cell = ndofs_per_cell(dh, 1) @@ -384,7 +370,7 @@ function generate_rhs(gpurhs::FerriteGPURHS) return rhs end -Base.@propagate_inbounds function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} +function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x for i = index:stride:length(cell_indices) @@ -427,15 +413,17 @@ end b = generate_rhs(FerriteGPURHS(dh, colors, ip, ip, qr)) -u = CUDA.fill(0.0, ndofs(dh)); -cg!(u, A, b; verbose=true); +u = CUDA.fill(0.f0, ndofs(dh)); +# cg!(u, A, b; verbose=true); +# @benchmark CUDA.@sync mul!($b,$A,$u) +# CUDA.@profile mul!(b,A,u) # ### Exporting to VTK # To visualize the result we export the grid and our field `u` # to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). 
-vtk_grid("heat_equation", dh) do vtk # The cake is a lie - vtk_point_data(vtk, dh, Array(u)) -end +# vtk_grid("heat_equation", dh) do vtk # The cake is a lie +# vtk_point_data(vtk, dh, Array(u)) +# end ## test the result #src # using Test #src From 07a0e7ae35634daa506dd87c9aba87f2d760d91b Mon Sep 17 00:00:00 2001 From: Dennis Ogiermann Date: Sat, 30 Dec 2023 15:51:38 +0100 Subject: [PATCH 07/16] GPU grid prototype. --- docs/src/literate-tutorials/gpu_assembly.jl | 58 ++++++++++++++++++--- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index bbc751cc82..f1a346834b 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -60,6 +60,22 @@ function Adapt.adapt_structure(to, nodes::Vector{Node{dim}}) where dim end ### TODO Adapt dofhandler? +struct CuGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:CUDA.Const{C,1}, NODA<:CUDA:Const{Vec{sdim,T},1}, COLA<:CUDA.Const{Int,1}} <: Ferrite.AbstractGrid{sdim} + cells::CELA + nodes::NODA + colors::COLA + # TODO subdomains +end + +CuGrid(grid::Grid{<:Any,<:Any,T}) where T = CuGrid(T, grid) +function CuGrid(::Type{T}, grid::Grid) + CuGrid( + Const(CuArray(getcells(grid))), + Const(CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid))))), + Const(CuArray(colors)) + ) +end + # TODO not sure how to do this automatically function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) ξ_x = ξ[1] @@ -223,7 +239,7 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{ end # Mass action on a single color -function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} +function gpu_full_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x @inbounds for i = index:stride:length(cell_indices) @@ -246,7 +262,7 @@ function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offset end # Mass action of the full operator -function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} +function gpu_full_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} # Initialize solution fill!(uout, 0.f0) synchronize() @@ -257,7 +273,7 @@ function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR numthreads = 256 numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) # try - @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) synchronize() break # catch err @@ -277,11 +293,9 @@ end # # Define the operator struct (https://iterativesolvers.julialinearalgebra.org/dev/getting_started/) -struct FerriteGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} +struct FerriteGPUMassOperator{TGRID, CADOFS, CAOFFSETS, IP, GIP, QR} # "GPUGrid" - gpu_cells::CACELLS - gpu_nodes::CANODES - colors::CACOLORS + grid::TGRID # "GPUDofHandler" all_cell_dofs::CADOFS cell_dof_offsets::CAOFFSETS @@ -303,12 +317,40 @@ function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) ) end -LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) +LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_full_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) Base.eltype(A::FerriteGPUMassOperator) = Float64 Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) +struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} + # "GPUGrid" + gpu_cells::CACELLS + gpu_nodes::CANODES + colors::CACOLORS + # "GPUDofHandler" + all_cell_dofs::CADOFS + cell_dof_offsets::CAOFFSETS + # "GPUValues" + ip::IP + ip_geo::GIP + qr::QR + # + element_matrices::EAMATS +end + +function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) + grid = Ferrite.get_grid(dh) + FerriteGPUMassOperator( + CuArray(getcells(grid)), + CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), + colors, + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + ip, ip_geo, qr + ) +end + struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} # "GPUGrid" gpu_cells::CACELLS From 0aca0274f9be54d232b498a73d578df9ac8abad2 Mon Sep 17 00:00:00 2001 From: termi-official Date: Sat, 30 Dec 2023 23:50:43 +0100 Subject: [PATCH 08/16] Initial GPUGrid and GPUDofHandler + performance bugfix. Can now generate NaNs at the speed of light. 
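
A sketch of how the two new wrappers are meant to be used together (constructors as
defined in this patch):

    gpugrid = GPUGrid(grid)    # cells and nodes copied into CuArrays
    gpudh   = GPUDofHandler(CuArray(dh.cell_dofs), CuArray(dh.cell_dofs_offset), gpugrid)

The intent is that kernels receive the whole handler and Adapt/`cudaconvert` turn the
contained `CuArray`s into device arrays at launch.
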
--- docs/src/literate-tutorials/gpu_assembly.jl | 303 +++++++++----------- src/Dofs/DofHandler.jl | 5 +- 2 files changed, 142 insertions(+), 166 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index f1a346834b..1a86c28c05 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -47,58 +47,64 @@ using IterativeSolvers, LinearAlgebra ### TODO Extension import Adapt using StaticArrays -struct GPUQuadratureRule{N,T,dim} +struct SmallQuadratureRule{N,T,dim} weights::SVector{N,T} points::SVector{N,Vec{dim,T}} end +SmallQuadratureRule(qr::QuadratureRule) = SmallQuadratureRule(SVector{length(qr.weights)}(qr.weights), SVector{length(qr.points)}(qr.points)) function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} N = length(qr.weights) - GPUQuadratureRule{N,Float32,dim}(SVector{N,Float32}(qr.weights), SVector{N,Vec{dim,Float32}}(qr.points)) + SmallQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) end -function Adapt.adapt_structure(to, nodes::Vector{Node{dim}}) where dim - CuArray(Vec{dim,Float32}.(get_node_coordinate.(grid.nodes))) +function Adapt.adapt_structure(to, nodes::Vector{Node{dim,T}}) where {dim,T} + CuArray(Vec{dim,T}.(get_node_coordinate.(grid.nodes))) end -### TODO Adapt dofhandler? -struct CuGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:CUDA.Const{C,1}, NODA<:CUDA:Const{Vec{sdim,T},1}, COLA<:CUDA.Const{Int,1}} <: Ferrite.AbstractGrid{sdim} +struct GPUGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:AbstractArray{C,1}, NODA<:AbstractArray{Node{sdim,T},1}} <: Ferrite.AbstractGrid{sdim} cells::CELA nodes::NODA - colors::COLA # TODO subdomains end +function Adapt.adapt_structure(to, grid::GPUGrid) + cells = Adapt.adapt_structure(to, grid.cells) + nodes = Adapt.adapt_structure(to, grid.nodes) + return GPUGrid(cells, nodes) +end -CuGrid(grid::Grid{<:Any,<:Any,T}) where T = CuGrid(T, grid) -function CuGrid(::Type{T}, grid::Grid) - CuGrid( - Const(CuArray(getcells(grid))), - Const(CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid))))), - Const(CuArray(colors)) +GPUGrid(grid::Grid{<:Any,<:Any,T}) where T = GPUGrid(T, grid) +function GPUGrid(::Type{T}, grid::Grid) where T + GPUGrid( + CuArray(getcells(grid)), + # CuArray(Vec{Ferrite.getdim(grid),T}.(get_node_coordinate.(getnodes(grid)))) + CuArray(getnodes(grid)) ) end -# TODO not sure how to do this automatically -function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) - ξ_x = ξ[1] - ξ_y = ξ[2] - i == 1 && return (1 - ξ_x) * (1 - ξ_y) / 4 - i == 2 && return (1 + ξ_x) * (1 - ξ_y) / 4 - i == 3 && return (1 + ξ_x) * (1 + ξ_y) / 4 - i == 4 && return (1 - ξ_x) * (1 + ξ_y) / 4 +struct GPUDofHandler{GRID <: GPUGrid, CADOFS <: AbstractArray{Int,1}, CAOFFSETS <: AbstractArray{Int,1}} + cell_dofs::CADOFS + cell_dofs_offset::CAOFFSETS + grid::GRID end - -function unsafe_shape_value(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}, i::Int) - ξ_x = ξ[1] - ξ_y = ξ[2] - ξ_z = ξ[3] - i == 1 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) - i == 2 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) - i == 3 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) - i == 4 && return 0.125(1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) - i == 5 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) - i == 6 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) - i == 7 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) - i == 8 && return 0.125(1 - ξ_x) * (1 + ξ_y) * 
(1 + ξ_z) +function GPUDofHandler(dh::DofHandler) + GPUDofHandler( + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + GPUGrid(dh.grid) + ) end +function GPUDofHandler(::Type{T}, dh::DofHandler) where T + GPUDofHandler( + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + GPUGrid(T, dh.grid) + ) +end +Ferrite.ndofs_per_cell(dh::GPUDofHandler, i::Int) = (cell_dofs_offset[i+1]-1)-cell_dofs_offset[i] +function Ferrite.celldofs(dh::GPUDofHandler, cell_index::Int) + cell_dof_range = dh.cell_dofs_offset[cell_index]:(dh.cell_dofs_offset[cell_index+1]-1) + return @view dh.cell_dofs[cell_dof_range] +end +Adapt.@adapt_structure GPUDofHandler function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] @@ -160,13 +166,15 @@ end # ntuple fails... cellnodes(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]]) +cellnodesvec(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.nodes[2]].x, nodes[cell.nodes[3]].x, nodes[cell.nodes[4]].x) cellnodes(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]], nodes[cell.nodes[5]], nodes[cell.nodes[6]], nodes[cell.nodes[7]], nodes[cell.nodes[8]]) +cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.nodes[2]].x, nodes[cell.nodes[3]].x, nodes[cell.nodes[4]].x, nodes[cell.nodes[5]].x, nodes[cell.nodes[6]].x, nodes[cell.nodes[7]].x, nodes[cell.nodes[8]].x) # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. grid = generate_grid(Hexahedron, (20, 20, 20)); -colors = CuArray.(create_coloring(grid)); # TODO add example without coloring, i.e. using Atomics instead +colors = create_coloring(grid); # TODO add example without coloring, i.e. 
using Atomics instead # ### Trial and test functions # A `CellValues` facilitates the process of evaluating values and gradients of @@ -218,7 +226,7 @@ close!(dh); # add!(ch, dbc); # close!(ch) -function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} +function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::SmallQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) # shapes = MVector{8,Float32}(undef) @inbounds for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) @@ -239,41 +247,41 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{ end # Mass action on a single color -function gpu_full_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} - index = threadIdx().x # this example only requires linear indexing, so just use `x` - stride = blockDim().x - @inbounds for i = index:stride:length(cell_indices) +function gpu_full_mass_action_kernel!(uout::AbstractVector, uin::AbstractVector, gpudh::GPUDofHandler, cell_indices::AbstractVector, qr::SmallQuadratureRule, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation) + index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + stride = gridDim().x * blockDim().x + i = index + while i <= length(cell_indices) ## Get index of the current cell cell_index = cell_indices[i] ## Grab the actual cell - cell = cells[cell_index] + cell = gpudh.grid.cells[cell_index] ## Grab the dofs on the cell - cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) - cell_dofs = @view all_cell_dofs[cell_dof_range] + cell_dofs = celldofs(gpudh, cell_index) ## Grab the buffers for the y and x uₑin = @view uin[cell_dofs] uₑout = @view uout[cell_dofs] ## Grab coordinate array - xₑ = cellnodes(cell, nodes) + xₑ = cellnodesvec(cell, gpudh.grid.nodes) ## Apply local action for y = Ax gpu_element_mass_action2!(uₑout, uₑin, qr, ip, ip_geo, xₑ) + i += stride end return nothing end # Mass action of the full operator -function gpu_full_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} +function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUDofHandler, qr, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, colors::AbstractVector) # Initialize solution - fill!(uout, 0.f0) + fill!(uout, 0.0) synchronize() # Apply action one time - dofs_per_cell = ndofs_per_cell(dh, 1) for color ∈ colors numthreads = 256 numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) # try - @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) synchronize() break # catch err @@ -293,98 +301,104 @@ end # # Define the operator struct (https://iterativesolvers.julialinearalgebra.org/dev/getting_started/) -struct FerriteGPUMassOperator{TGRID, CADOFS, CAOFFSETS, IP, GIP, QR} - # "GPUGrid" - grid::TGRID - # "GPUDofHandler" - all_cell_dofs::CADOFS - cell_dof_offsets::CAOFFSETS +struct FerriteMFMassOperator{DH, COLS, IP, GIP, QR} + dh::DH + colors::COLS # "GPUValues" ip::IP ip_geo::GIP qr::QR end -function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) - grid = Ferrite.get_grid(dh) - FerriteGPUMassOperator( - CuArray(getcells(grid)), - CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), - colors, - CuArray(dh.cell_dofs), - CuArray(dh.cell_dofs_offset), - ip, ip_geo, qr - ) -end - -LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_full_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) -Base.eltype(A::FerriteGPUMassOperator) = Float64 -Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator +LinearAlgebra.mul!(y, A::FerriteMFMassOperator{<:GPUDofHandler}, x) = gpu_full_mass_action!(y, x, A.dh, A.qr, A.ip, A.ip_geo, A.colors) +Base.eltype(A::FerriteMFMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T +Base.eltype(A::FerriteMFMassOperator{<:GPUGrid{<:Any,<:Any,T}}) where T = T +Base.size(A::FerriteMFMassOperator, d) = ndofs(dh) # Square operator + +# struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} +# # "GPUGrid" +# gpu_cells::CACELLS +# gpu_nodes::CANODES +# colors::CACOLORS +# # "GPUDofHandler" +# all_cell_dofs::CADOFS +# cell_dofs_offset::CAOFFSETS +# # "GPUValues" +# ip::IP +# ip_geo::GIP +# qr::QR +# # +# element_matrices::EAMATS +# end -A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) +# function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) +# grid = Ferrite.get_grid(dh) +# FerriteGPUMassOperator( +# CuArray(getcells(grid)), +# CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), +# colors, +# CuArray(dh.cell_dofs), +# CuArray(dh.cell_dofs_offset), +# ip, ip_geo, qr +# ) +# end -struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} - # "GPUGrid" - gpu_cells::CACELLS - gpu_nodes::CANODES - colors::CACOLORS - # "GPUDofHandler" - all_cell_dofs::CADOFS - cell_dof_offsets::CAOFFSETS +struct FerriteRHS{DH, COLS, IP, GIP, QR} + dh::DH + colors::COLS # "GPUValues" ip::IP ip_geo::GIP qr::QR - # - element_matrices::EAMATS end -function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) - grid = Ferrite.get_grid(dh) - FerriteGPUMassOperator( - CuArray(getcells(grid)), - CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), - colors, - CuArray(dh.cell_dofs), - CuArray(dh.cell_dofs_offset), - ip, ip_geo, qr - ) -end +function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC) where {RHS, GPUDH, QR, FIP, GIP, GPUC} + index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + stride = gridDim().x * blockDim().x + i = index + while i <= 
length(cell_indices) + ## Get index of the current cell + cell_index = cell_indices[i] + ## Grab the actual cell + cell = gpudh.grid.cells[cell_index] + # ## Grab the dofs on the cell + cell_dofs = celldofs(gpudh, cell_index) + ## Grab the buffers for the y and x + rhsₑ = @view rhs[cell_dofs] + ## Grab coordinate array + coords = cellnodesvec(cell, gpudh.grid.nodes) + ## Apply local action for y = Ax + n_basefuncs = getnbasefunctions(ip) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, coords, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + + # # TODO spatial_coordinate + x = zero(Vec{3,Float64}) + ϕᵍ = shape_values(ip_geo, ξ) + for i in 1:n_basefuncs + x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] + end -struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} - # "GPUGrid" - gpu_cells::CACELLS - gpu_nodes::CANODES - colors::CACOLORS - # "GPUDofHandler" - all_cell_dofs::CADOFS - cell_dof_offsets::CAOFFSETS - # "GPUValues" - ip::IP - ip_geo::GIP - qr::QR + ϕᶠ = shape_values(ip, ξ) + @inbounds for i in 1:n_basefuncs + rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ + end + end + i += stride + end end -function FerriteGPURHS(dh, colors, ip, ip_geo, qr) - grid = Ferrite.get_grid(dh) - FerriteGPURHS( - CuArray(getcells(grid)), - CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), - colors, - CuArray(dh.cell_dofs), - CuArray(dh.cell_dofs_offset), - ip, ip_geo, qr - ) -end -function generate_rhs(gpurhs::FerriteGPURHS) - rhs = CuArray(zeros(Float32,ndofs(dh))) +function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) + rhs = CuArray(zeros(Float64,ndofs(dh))) numthreads = 256 numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) - dofs_per_cell = ndofs_per_cell(dh, 1) for color ∈ gpurhs.colors # try - @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.all_cell_dofs, gpurhs.cell_dof_offsets, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color, gpurhs.gpu_cells, gpurhs.gpu_nodes, dofs_per_cell) + @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) synchronize() # catch err # code_typed(err; interactive = true) @@ -412,52 +426,13 @@ function generate_rhs(gpurhs::FerriteGPURHS) return rhs end -function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} - index = threadIdx().x # this example only requires linear indexing, so just use `x` - stride = blockDim().x - for i = index:stride:length(cell_indices) - ## Get index of the current cell - cell_index = cell_indices[i] - ## Grab the actual cell - cell = cells[cell_index] - ## Grab the dofs on the cell - cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) - cell_dofs = @view all_cell_dofs[cell_dof_range] - ## Grab the buffers for the y and x - rhsₑ = @view rhs[cell_dofs] - ## Grab coordinate array - coords = cellnodes(cell, nodes) - ## Apply local action for y = Ax - gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) - end -end - -function gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) - n_basefuncs = getnbasefunctions(ip) - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) - ξ = qr.points[q_point] - # TODO recover abstraction layer - J = getJ(ip_geo, coords, ξ) - dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) - - # TODO spatial_coordinate - x = zero(Vec{3,Float64}) - for i in 1:n_basefuncs - x += unsafe_shape_value(ip_geo, ξ, i)*coords[i] #geometric_value(fe_v, q_point, i) * x[i] - end - - for i in 1:n_basefuncs - ϕᵢ = unsafe_shape_value(ip, ξ, i) - rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᵢ * dΩ - end - end -end - - -b = generate_rhs(FerriteGPURHS(dh, colors, ip, ip, qr)) -u = CUDA.fill(0.f0, ndofs(dh)); -# cg!(u, A, b; verbose=true); -# @benchmark CUDA.@sync mul!($b,$A,$u) +cudh = GPUDofHandler(dh) +cucols = CuArray.(colors) +Agpu = FerriteMFMassOperator(cudh, cucols, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(FerriteRHS(cudh, cucols, ip, ip, SmallQuadratureRule(qr))) +u = CUDA.fill(0.0, ndofs(dh)); +cg!(u, Agpu, bgpu; verbose=true); +# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu); # CUDA.@profile mul!(b,A,u) # ### Exporting to VTK diff --git a/src/Dofs/DofHandler.jl b/src/Dofs/DofHandler.jl index e6e88dff19..f43fbb039d 100644 --- a/src/Dofs/DofHandler.jl +++ b/src/Dofs/DofHandler.jl @@ -131,7 +131,7 @@ close!(dh) function DofHandler(grid::G) where {dim, G <: AbstractGrid{dim}} ncells = getncells(grid) sdhs = SubDofHandler{DofHandler{dim, G}}[] - DofHandler{dim, G}(sdhs, Symbol[], Int[], zeros(Int, ncells), zeros(Int, ncells), ScalarWrapper(false), grid, ScalarWrapper(-1)) + DofHandler{dim, G}(sdhs, Symbol[], Int[], zeros(Int, ncells+1), zeros(Int, ncells), ScalarWrapper(false), grid, ScalarWrapper(-1)) end function Base.show(io::IO, mime::MIME"text/plain", dh::DofHandler) @@ -264,7 +264,7 @@ function add!(sdh::SubDofHandler, name::Symbol, ip::Interpolation) # TODO: warn if interpolation type is not the same? 
end end - + # Check that interpolation is compatible with cells it it added to refshape_sdh = getrefshape(getcells(sdh.dh.grid, first(sdh.cellset))) if refshape_sdh !== getrefshape(ip) @@ -374,6 +374,7 @@ function __close!(dh::DofHandler{dim}) where {dim} facedicts, ) end + dh.cell_dofs_offset[end] = length(dh.cell_dofs) + 1 dh.ndofs[] = maximum(dh.cell_dofs; init=0) dh.closed[] = true From 955a233635dadf5499ce3566760663556cea035d Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 00:44:19 +0100 Subject: [PATCH 09/16] Derp. --- docs/src/literate-tutorials/gpu_assembly.jl | 74 ++++++++++----------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 1a86c28c05..242910c5bd 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -75,7 +75,6 @@ GPUGrid(grid::Grid{<:Any,<:Any,T}) where T = GPUGrid(T, grid) function GPUGrid(::Type{T}, grid::Grid) where T GPUGrid( CuArray(getcells(grid)), - # CuArray(Vec{Ferrite.getdim(grid),T}.(get_node_coordinate.(getnodes(grid)))) CuArray(getnodes(grid)) ) end @@ -109,11 +108,13 @@ Adapt.@adapt_structure GPUDofHandler function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] + Nx = ((1 - ξ_x), (1 + ξ_x)) + Ny = ((1 - ξ_y), (1 + ξ_y)) return @SVector [ - (1 - ξ_x) * (1 - ξ_y) / 4, - (1 + ξ_x) * (1 - ξ_y) / 4, - (1 + ξ_x) * (1 + ξ_y) / 4, - (1 - ξ_x) * (1 + ξ_y) / 4, + Nx[1] * Ny[1] / 4, + Nx[2] * Ny[1] / 4, + Nx[2] * Ny[2] / 4, + Nx[1] * Ny[2] / 4, ] end @@ -121,15 +122,18 @@ function shape_values(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}) ξ_x = ξ[1] ξ_y = ξ[2] ξ_z = ξ[3] + Nx = ((1 - ξ_x), (1 + ξ_x)) + Ny = ((1 - ξ_y), (1 + ξ_y)) + Nz = ((1 - ξ_z), (1 + ξ_z)) return @SVector [ - (1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, - (1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, - (1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, - (1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, - (1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, - (1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, - (1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, - (1 - ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, + Nx[1] * Ny[1] * Nz[1] / 8, + Nx[2] * Ny[1] * Nz[1] / 8, + Nx[2] * Ny[2] * Nz[1] / 8, + Nx[1] * Ny[2] * Nz[1] / 8, + Nx[1] * Ny[1] * Nz[2] / 8, + Nx[2] * Ny[1] * Nz[2] / 8, + Nx[2] * Ny[2] * Nz[2] / 8, + Nx[1] * Ny[2] * Nz[2] / 8, ] end @@ -238,8 +242,8 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::SmallQuadratureRul for i in 1:n_basefuncs ϕᵢ = shapes[i] for j in 1:n_basefuncs - inner = shapes[j]*uₑin[j] - uₑout[i] += inner*ϕᵢ*dΩ + ϕⱼ = shapes[j] + uₑout[i] += ϕᵢ*ϕⱼ*uₑin[j]*dΩ end end end @@ -250,10 +254,9 @@ end function gpu_full_mass_action_kernel!(uout::AbstractVector, uin::AbstractVector, gpudh::GPUDofHandler, cell_indices::AbstractVector, qr::SmallQuadratureRule, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation) index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x - i = index - while i <= length(cell_indices) + while index <= length(cell_indices) ## Get index of the current cell - cell_index = cell_indices[i] + cell_index = cell_indices[index] ## Grab the actual cell cell = gpudh.grid.cells[cell_index] ## Grab the dofs on the cell @@ -265,7 +268,7 @@ function gpu_full_mass_action_kernel!(uout::AbstractVector, uin::AbstractVector, xₑ = cellnodesvec(cell, gpudh.grid.nodes) ## Apply local action for y = Ax gpu_element_mass_action2!(uₑout, uₑin, qr, ip, ip_geo, xₑ) - i += stride + index 
+= stride end return nothing end @@ -278,12 +281,11 @@ function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUD # Apply action one time for color ∈ colors - numthreads = 256 - numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) + #numthreads = 1 + #numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) # try - @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) + @cuda gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) synchronize() - break # catch err # code_typed(err; interactive = true) # end @@ -355,10 +357,9 @@ end function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC) where {RHS, GPUDH, QR, FIP, GIP, GPUC} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x - i = index - while i <= length(cell_indices) + while index <= length(cell_indices) ## Get index of the current cell - cell_index = cell_indices[i] + cell_index = cell_indices[index] ## Grab the actual cell cell = gpudh.grid.cells[cell_index] # ## Grab the dofs on the cell @@ -368,7 +369,6 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c ## Grab coordinate array coords = cellnodesvec(cell, gpudh.grid.nodes) ## Apply local action for y = Ax - n_basefuncs = getnbasefunctions(ip) for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) ξ = qr.points[q_point] # TODO recover abstraction layer @@ -378,27 +378,27 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c # # TODO spatial_coordinate x = zero(Vec{3,Float64}) ϕᵍ = shape_values(ip_geo, ξ) - for i in 1:n_basefuncs + for i in 1:getnbasefunctions(ip_geo) x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] end ϕᶠ = shape_values(ip, ξ) - @inbounds for i in 1:n_basefuncs + @inbounds for i in 1:getnbasefunctions(ip) rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ end end - i += stride + index += stride end end function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) rhs = CuArray(zeros(Float64,ndofs(dh))) - numthreads = 256 - numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) + # numthreads = 1 + # numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) for color ∈ gpurhs.colors # try - @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) + @cuda gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) synchronize() # catch err # code_typed(err; interactive = true) @@ -427,12 +427,12 @@ function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) end cudh = GPUDofHandler(dh) -cucols = CuArray.(colors) -Agpu = FerriteMFMassOperator(cudh, cucols, ip, ip, SmallQuadratureRule(qr)) -bgpu = generate_rhs(FerriteRHS(cudh, cucols, ip, ip, SmallQuadratureRule(qr))) +cucolors = CuArray.(colors) +Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr))) u = CUDA.fill(0.0, ndofs(dh)); cg!(u, Agpu, bgpu; verbose=true); -# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu); +# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) # CUDA.@profile mul!(b,A,u) # ### Exporting to VTK From 75c6dd716fe747876bd85cb40a42dcf530263a4b Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 01:03:27 +0100 Subject: [PATCH 10/16] Back to Float32. 
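Going back to Float32 means every ingredient the kernels touch (grid corners, quadrature rule, right
hand side and solution vectors) carries the same scalar type, so nothing silently promotes to Float64
on the device. Small illustration of the promotion rule being avoided (not part of the diff):

    a = 1.0f0
    println(typeof(0.25  * a))   # Float64 -- a single Float64 literal promotes the product
    println(typeof(0.25f0 * a))  # Float32 -- stays in single precision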
--- docs/src/literate-tutorials/gpu_assembly.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 242910c5bd..bfde120974 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -177,7 +177,7 @@ cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.node # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Hexahedron, (20, 20, 20)); +grid = generate_grid(Hexahedron, (20, 20, 20), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); colors = create_coloring(grid); # TODO add example without coloring, i.e. using Atomics instead # ### Trial and test functions @@ -189,7 +189,7 @@ colors = create_coloring(grid); # TODO add example without coloring, i.e. using # the same reference element. We combine the interpolation and the quadrature rule # to a `CellValues` object. ip = Lagrange{RefHexahedron, 1}() -qr = QuadratureRule{RefHexahedron}(2) +qr = QuadratureRule{RefHexahedron,Float32}(2) cellvalues = CellValues(qr, ip); # ### Degrees of freedom @@ -354,7 +354,7 @@ struct FerriteRHS{DH, COLS, IP, GIP, QR} qr::QR end -function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC) where {RHS, GPUDH, QR, FIP, GIP, GPUC} +function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {T} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while index <= length(cell_indices) @@ -376,7 +376,7 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) # # TODO spatial_coordinate - x = zero(Vec{3,Float64}) + x = zero(Vec{3,T}) ϕᵍ = shape_values(ip_geo, ξ) for i in 1:getnbasefunctions(ip_geo) x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] @@ -392,8 +392,8 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c end -function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) - rhs = CuArray(zeros(Float64,ndofs(dh))) +function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}) where T + rhs = CuArray(zeros(T,ndofs(dh))) # numthreads = 1 # numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) for color ∈ gpurhs.colors @@ -429,8 +429,9 @@ end cudh = GPUDofHandler(dh) cucolors = CuArray.(colors) Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) -bgpu = generate_rhs(FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr))) -u = CUDA.fill(0.0, ndofs(dh)); +rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(rhsop) +u = CUDA.fill(0.f0, ndofs(dh)); cg!(u, Agpu, bgpu; verbose=true); # @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) # CUDA.@profile mul!(b,A,u) From 10d6a51085d606f1fc3326fd0ab36636fad21f1c Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 03:01:20 +0100 Subject: [PATCH 11/16] Fix Float32 2D grid generator. 
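The 2D node generator computed coordinates through Float64 ratios (`j / (nx - 1)` is Float64) and
pushed an untyped `Node((x, y))`, so the coordinates came out as Float64 even when the corners were
Float32. The fix dispatches on the node eltype and converts explicitly. Sketch of the promotion being
guarded against (illustrative, not part of the diff):

    ratio = 1 / 3                   # Int / Int -> Float64
    x0, x1 = -1.0f0, 1.0f0          # Float32 corners
    x = x0 * (1 - ratio) + ratio * x1
    println(typeof(x))              # Float64 -- hence the explicit T(x), T(y) in the patch
    println(typeof(Float32(x)))     # Float32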
--- src/Grid/grid_generators.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Grid/grid_generators.jl b/src/Grid/grid_generators.jl index 38bebc88f0..b2d119bf2e 100644 --- a/src/Grid/grid_generators.jl +++ b/src/Grid/grid_generators.jl @@ -83,7 +83,7 @@ function generate_grid(::Type{QuadraticLine}, nel::NTuple{1,Int}, left::Vec{1,T} return Grid(cells, nodes, facesets=facesets, boundary_matrix=boundary_matrix) end -function _generate_2d_nodes!(nodes, nx, ny, LL, LR, UR, UL) +function _generate_2d_nodes!(nodes::AbstractVector{<:Node{2,T}}, nx, ny, LL, LR, UR, UL) where T for i in 0:ny-1 ratio_bounds = i / (ny-1) @@ -97,7 +97,7 @@ function _generate_2d_nodes!(nodes, nx, ny, LL, LR, UR, UL) ratio = j / (nx-1) x = x0 * (1 - ratio) + ratio * x1 y = y0 * (1 - ratio) + ratio * y1 - push!(nodes, Node((x, y))) + push!(nodes, Node((T(x), T(y)))) end end end From 778d57b075764ef7396d90b4c6ef1d56b56d3ea4 Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 03:01:42 +0100 Subject: [PATCH 12/16] Fix performance and add element assembly. --- docs/src/literate-tutorials/gpu_assembly.jl | 171 +++++++++++++------- 1 file changed, 117 insertions(+), 54 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index bfde120974..252d1887fd 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -46,7 +46,7 @@ using IterativeSolvers, LinearAlgebra ### TODO Extension import Adapt -using StaticArrays +using StaticArrays, SparseArrays struct SmallQuadratureRule{N,T,dim} weights::SVector{N,T} points::SVector{N,Vec{dim,T}} @@ -177,32 +177,81 @@ cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.node # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Hexahedron, (20, 20, 20), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); +# grid = generate_grid(Hexahedron, (2, 1, 1), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); +grid = generate_grid(Quadrilateral, (2, 1), Vec((-1.f0,-1.f0)), Vec((1.f0,1.f0))); colors = create_coloring(grid); # TODO add example without coloring, i.e. using Atomics instead -# ### Trial and test functions -# A `CellValues` facilitates the process of evaluating values and gradients of -# test and trial functions (among other things). To define -# this we need to specify an interpolation space for the shape functions. -# We use Lagrange functions -# based on the two-dimensional reference quadrilateral. We also define a quadrature rule based on -# the same reference element. We combine the interpolation and the quadrature rule -# to a `CellValues` object. -ip = Lagrange{RefHexahedron, 1}() -qr = QuadratureRule{RefHexahedron,Float32}(2) +ip = Lagrange{RefQuadrilateral, 1}() +qr = QuadratureRule{RefQuadrilateral,Float32}(2) cellvalues = CellValues(qr, ip); -# ### Degrees of freedom -# Next we need to define a `DofHandler`, which will take care of numbering -# and distribution of degrees of freedom for our approximated fields. -# We create the `DofHandler` and then add a single scalar field called `:u` based on -# our interpolation `ip` defined above. -# Lastly we `close!` the `DofHandler`, it is now that the dofs are distributed -# for all the elements. 
dh = DofHandler(grid) add!(dh, :u, ip) close!(dh); +function assemble_element!(Ke::AbstractMatrix, cellvalues::CellValues) + n_basefuncs = getnbasefunctions(cellvalues) + fill!(Ke, 0) + for q_point in 1:getnquadpoints(cellvalues) + dΩ = getdetJdV(cellvalues, q_point) + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + for j in 1:n_basefuncs + u = shape_value(cellvalues, q_point, j) + Ke[i, j] += (δu ⋅ u) * dΩ + end + end + end + return Ke +end + +function assemble_global(cellvalues::CellValues, K::SparseMatrixCSC, dh::DofHandler) + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(n_basefuncs, n_basefuncs) + assembler = start_assemble(K) + for cell in CellIterator(dh) + reinit!(cellvalues, cell) + assemble_element!(Ke, cellvalues) + assemble!(assembler, celldofs(cell), Ke) + end + return K +end + +function assemble_global(cellvalues::CellValues, K::Array{<:Any,3}, dh::DofHandler) + for cell in CellIterator(dh) + ## Reinitialize cellvalues for this cell + reinit!(cellvalues, cell) + ## Compute element contribution + Ke = @view K[:,:,cellid(cell)] + assemble_element!(Ke, cellvalues) + end + return K +end + +Kcpu = create_sparsity_pattern(dh) +assemble_global(cellvalues, Kcpu, dh) + +Keacpu = zeros(Float32, ndofs_per_cell(dh,1),ndofs_per_cell(dh,1),length(dh.grid.cells)) +assemble_global(cellvalues, Keacpu, dh) + +function d2eop(dh::DofHandler) + I = Int32[] + J = Int32[] + next_edof_idx = 1 + for ei ∈ 1:length(dh.grid.cells) + cell_dofs = celldofs(dh, ei) + for cell_dof ∈ cell_dofs + push!(I, Int32(next_edof_idx)) + push!(J, Int32(cell_dof)) + next_edof_idx += 1 + end + end + V = ones(Bool, next_edof_idx-1) + return sparse(I,J,V) +end + +d2e = d2eop(dh) + # ### Boundary conditions # In Ferrite constraints like Dirichlet boundary conditions # are handled by a `ConstraintHandler`. @@ -280,11 +329,11 @@ function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUD synchronize() # Apply action one time + numthreads = 128 for color ∈ colors - #numthreads = 1 - #numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) + numblocks = ceil(Int, length(color)/numthreads) # try - @cuda gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) + @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) synchronize() # catch err # code_typed(err; interactive = true) @@ -317,33 +366,44 @@ Base.eltype(A::FerriteMFMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T Base.eltype(A::FerriteMFMassOperator{<:GPUGrid{<:Any,<:Any,T}}) where T = T Base.size(A::FerriteMFMassOperator, d) = ndofs(dh) # Square operator -# struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} -# # "GPUGrid" -# gpu_cells::CACELLS -# gpu_nodes::CANODES -# colors::CACOLORS -# # "GPUDofHandler" -# all_cell_dofs::CADOFS -# cell_dofs_offset::CAOFFSETS -# # "GPUValues" -# ip::IP -# ip_geo::GIP -# qr::QR -# # -# element_matrices::EAMATS -# end +struct FerriteEAMassOperator{D2E, XE2, EAMATS <: AbstractArray{<:Any,3}} + d2e::D2E + xe2::XE2 + element_matrices::EAMATS +end -# function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) -# grid = Ferrite.get_grid(dh) -# FerriteGPUMassOperator( -# CuArray(getcells(grid)), -# CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), -# colors, -# CuArray(dh.cell_dofs), -# CuArray(dh.cell_dofs_offset), -# ip, ip_geo, qr -# ) -# end +function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::CuDeviceArray{T,2}) where T + cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + stride = gridDim().x * blockDim().x + while cell_index ≤ size(emats,3) + xei = @view xe[:,cell_index] + xe2i = @view xe2[:,cell_index] + emati = @view emats[:,:,cell_index] + # mul!(xe2i, emati, xei) # fails? + xe2i .= T(0.0) + for i ∈ 1:size(emats,1), j ∈ 1:size(emats,2) + xe2i[i] += emati[i,j]*xei[j] + end + cell_index += stride + end +end + +function LinearAlgebra.mul!(y,A::FerriteEAMassOperator{<:Any,<:Any,<:CuArray},x) + xe = A.d2e*x + synchronize() + xev = reshape(xe, (size(A.element_matrices,2), size(A.element_matrices,3))) + xe2v = reshape(A.xe2, (size(A.element_matrices,2), size(A.element_matrices,3))) + synchronize() + # CUDA.CUBLAS.gemv_batched!(???) 
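+    # Gather / apply / scatter: d2e*x packs the global dofs into element-local blocks,
+    # the kernel launch below computes xe2v[:, c] = element_matrices[:, :, c] * xev[:, c] per cell c,
+    # and transpose(d2e) accumulates the element results back into the global vector.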
+ numthreads = 128 + numblocks = ceil(Int, size(A.element_matrices,3)/numthreads) + @cuda threads=numthreads blocks=numblocks gemv_batched!(xe2v, A.element_matrices, xev) + y .= transpose(A.d2e)*A.xe2 + synchronize() +end +Base.eltype(A::FerriteEAMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T +Base.eltype(A::FerriteEAMassOperator{<:GPUGrid{<:Any,<:Any,T}}) where T = T +Base.size(A::FerriteEAMassOperator, d) = ndofs(dh) # Square operator struct FerriteRHS{DH, COLS, IP, GIP, QR} dh::DH @@ -354,7 +414,7 @@ struct FerriteRHS{DH, COLS, IP, GIP, QR} qr::QR end -function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {T} +function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{sdim,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {sdim,T} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while index <= length(cell_indices) @@ -376,7 +436,7 @@ function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{ dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) # # TODO spatial_coordinate - x = zero(Vec{3,T}) + x = zero(Vec{sdim,T}) ϕᵍ = shape_values(ip_geo, ξ) for i in 1:getnbasefunctions(ip_geo) x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] @@ -394,11 +454,11 @@ end function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}) where T rhs = CuArray(zeros(T,ndofs(dh))) - # numthreads = 1 - # numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) + numthreads = 128 for color ∈ gpurhs.colors + numblocks = ceil(Int, length(color)/numthreads) # try - @cuda gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) + @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) synchronize() # catch err # code_typed(err; interactive = true) @@ -432,10 +492,13 @@ Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) bgpu = generate_rhs(rhsop) u = CUDA.fill(0.f0, ndofs(dh)); -cg!(u, Agpu, bgpu; verbose=true); +# cg!(u, Agpu, bgpu; verbose=true); # @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) # CUDA.@profile mul!(b,A,u) +Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSC(d2e), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) +# @benchmark CUDA.@sync mul!($u,$Aeagpu,$bgpu) + # ### Exporting to VTK # To visualize the result we export the grid and our field `u` # to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). From 3e9edc521c387fc46b71346a90a20711977fbec3 Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 03:50:49 +0100 Subject: [PATCH 13/16] Benchmark code. 
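The timing calls themselves stay commented out; the intent is to compare three ways of applying the
mass operator on the same 100x100x100 hexahedral problem: the matrix-free operator, the
element-assembly operator, and an assembled sparse matrix as baseline. Rough sketch of how to run them
(assumes BenchmarkTools and the operators/vectors from the script being in scope):

    using BenchmarkTools, CUDA, LinearAlgebra
    @benchmark CUDA.@sync mul!($ugpu, $Agpu,   $bgpu)   # matrix-free: coloring + per-cell kernels
    @benchmark CUDA.@sync mul!($ugpu, $Aeagpu, $bgpu)   # element assembly: batched per-cell mat-vec
    @benchmark CUDA.@sync mul!($ugpu, $Kgpu,   $bgpu)   # baseline: assembled CUSPARSE SpMV

CUDA.@sync matters here so the measurement includes kernel execution rather than just the
asynchronous launch.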
--- docs/src/literate-tutorials/gpu_assembly.jl | 50 ++++++++++----------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 252d1887fd..7a15fb520c 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -177,12 +177,12 @@ cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.node # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -# grid = generate_grid(Hexahedron, (2, 1, 1), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); -grid = generate_grid(Quadrilateral, (2, 1), Vec((-1.f0,-1.f0)), Vec((1.f0,1.f0))); +grid = generate_grid(Hexahedron, (100, 100, 100), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); +# grid = generate_grid(Quadrilateral, (2, 1), Vec((-1.f0,-1.f0)), Vec((1.f0,1.f0))); colors = create_coloring(grid); # TODO add example without coloring, i.e. using Atomics instead -ip = Lagrange{RefQuadrilateral, 1}() -qr = QuadratureRule{RefQuadrilateral,Float32}(2) +ip = Lagrange{RefHexahedron, 1}() +qr = QuadratureRule{RefHexahedron,Float32}(2) cellvalues = CellValues(qr, ip); dh = DofHandler(grid) @@ -228,7 +228,7 @@ function assemble_global(cellvalues::CellValues, K::Array{<:Any,3}, dh::DofHandl return K end -Kcpu = create_sparsity_pattern(dh) +Kcpu = convert(SparseMatrixCSC{Float32, Int32}, create_sparsity_pattern(dh)) assemble_global(cellvalues, Kcpu, dh) Keacpu = zeros(Float32, ndofs_per_cell(dh,1),ndofs_per_cell(dh,1),length(dh.grid.cells)) @@ -246,35 +246,20 @@ function d2eop(dh::DofHandler) next_edof_idx += 1 end end - V = ones(Bool, next_edof_idx-1) + V = ones(Float32, next_edof_idx-1) return sparse(I,J,V) end d2e = d2eop(dh) # ### Boundary conditions -# In Ferrite constraints like Dirichlet boundary conditions -# are handled by a `ConstraintHandler`. # ch = ConstraintHandler(dh); - -# Next we need to add constraints to `ch`. For this problem we define -# homogeneous Dirichlet boundary conditions on the whole boundary, i.e. -# the `union` of all the face sets on the boundary. # ∂Ω = union( # getfaceset(grid, "left"), # getfaceset(grid, "right"), # getfaceset(grid, "top"), # getfaceset(grid, "bottom"), # ); - -# Now we are set up to define our constraint. We specify which field -# the condition is for, and our combined face set `∂Ω`. The last -# argument is a function of the form $f(\textbf{x})$ or $f(\textbf{x}, t)$, -# where $\textbf{x}$ is the spatial coordinate and -# $t$ the current time, and returns the prescribed value. Since the boundary condition in -# this case do not depend on time we define our function as $f(\textbf{x}) = 0$, i.e. -# no matter what $\textbf{x}$ we return $0$. When we have -# specified our constraint we `add!` it to `ch`. 
# dbc = Dirichlet(:u, ∂Ω, (x, t) -> 0) # add!(ch, dbc); # close!(ch) @@ -373,6 +358,10 @@ struct FerriteEAMassOperator{D2E, XE2, EAMATS <: AbstractArray{<:Any,3}} end function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::CuDeviceArray{T,2}) where T + # base_cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + # stride = blockDim().x + # for cell_index ∈ base_cell_index:min(base_cell_index+stride,size(emats,3)) + cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while cell_index ≤ size(emats,3) @@ -381,7 +370,7 @@ function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::C emati = @view emats[:,:,cell_index] # mul!(xe2i, emati, xei) # fails? xe2i .= T(0.0) - for i ∈ 1:size(emats,1), j ∈ 1:size(emats,2) + for j ∈ 1:size(emats,2), i ∈ 1:size(emats,1) xe2i[i] += emati[i,j]*xei[j] end cell_index += stride @@ -398,7 +387,8 @@ function LinearAlgebra.mul!(y,A::FerriteEAMassOperator{<:Any,<:Any,<:CuArray},x) numthreads = 128 numblocks = ceil(Int, size(A.element_matrices,3)/numthreads) @cuda threads=numthreads blocks=numblocks gemv_batched!(xe2v, A.element_matrices, xev) - y .= transpose(A.d2e)*A.xe2 + synchronize() + mul!(y, transpose(A.d2e), A.xe2) synchronize() end Base.eltype(A::FerriteEAMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T @@ -491,13 +481,19 @@ cucolors = CuArray.(colors) Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) bgpu = generate_rhs(rhsop) -u = CUDA.fill(0.f0, ndofs(dh)); -# cg!(u, Agpu, bgpu; verbose=true); +ugpu = CUDA.fill(0.f0, ndofs(dh)); +# cg!(ugpu, Agpu, bgpu; verbose=true); # @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) -# CUDA.@profile mul!(b,A,u) Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSC(d2e), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) -# @benchmark CUDA.@sync mul!($u,$Aeagpu,$bgpu) +# @benchmark CUDA.@sync mul!($ugpu,$Aeagpu,$bgpu) + +ucpu = copy(Array(ugpu)) +bcpu = copy(Array(bgpu)) +# @benchmark mul!($u,$Kcpu,$(Array(bgpu))) + +Kgpu = CUDA.CUSPARSE.CuSparseMatrixCSC(Kcpu) +# @benchmark mul!($ugpu,$Kcpu,$bgpu) # ### Exporting to VTK # To visualize the result we export the grid and our field `u` From 7a42d4cd7272db71696bf4f9ff0d1e419b4d820c Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 23:50:49 +0100 Subject: [PATCH 14/16] Move small quadrature rule to Ferrite core --- docs/src/literate-tutorials/gpu_assembly.jl | 12 ----------- src/Quadrature/quadrature.jl | 22 +++++++++++++++++++++ src/exports.jl | 2 ++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 7a15fb520c..5277eb5e7a 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -47,18 +47,6 @@ using IterativeSolvers, LinearAlgebra ### TODO Extension import Adapt using StaticArrays, SparseArrays -struct SmallQuadratureRule{N,T,dim} - weights::SVector{N,T} - points::SVector{N,Vec{dim,T}} -end -SmallQuadratureRule(qr::QuadratureRule) = SmallQuadratureRule(SVector{length(qr.weights)}(qr.weights), SVector{length(qr.points)}(qr.points)) -function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} - N = length(qr.weights) - SmallQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) -end -function Adapt.adapt_structure(to, 
nodes::Vector{Node{dim,T}}) where {dim,T} - CuArray(Vec{dim,T}.(get_node_coordinate.(grid.nodes))) -end struct GPUGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:AbstractArray{C,1}, NODA<:AbstractArray{Node{sdim,T},1}} <: Ferrite.AbstractGrid{sdim} cells::CELA diff --git a/src/Quadrature/quadrature.jl b/src/Quadrature/quadrature.jl index 5623f89b19..ec4d5be589 100644 --- a/src/Quadrature/quadrature.jl +++ b/src/Quadrature/quadrature.jl @@ -154,6 +154,17 @@ function QuadratureRule{RefPyramid, T}(quad_type::Symbol, order::Int) where T QuadratureRule{RefPyramid,T}(weights, points) end +####################### +# SmallQuadratureRule # +####################### +struct SmallQuadratureRule{N,T,dim} + weights::SVector{N,T} + points::SVector{N,Vec{dim,T}} +end +SmallQuadratureRule(qr::QuadratureRule) = SmallQuadratureRule(SVector{length(qr.weights)}(qr.weights), SVector{length(qr.points)}(qr.points)) +SmallQuadratureRule{refshape,T}(quad_type::Symbol, order::Int) where {refshape,T} = SmallQuadratureRule(QuadratureRule{refshape,T}(quad_type, order)) +SmallQuadratureRule{refshape,T}(order::Int) where {refshape,T} = SmallQuadratureRule(QuadratureRule{refshape,T}(order)) + ###################### # FaceQuadratureRule # ###################### @@ -230,6 +241,17 @@ function FaceQuadratureRule{RefPyramid, T}(quad_type::Symbol, order::Int) where [2,3,4,5], qr_tri.weights, qr_tri.points) end + +########################### +# SmallFaceQuadratureRule # +########################### +struct SmallFaceQuadratureRule{N,T,dim,M} + face_rules::SVector{M,SmallQuadratureRule{N, T, dim}} +end +SmallFaceQuadratureRule(fqr::FaceQuadratureRule{shape}) where shape = SmallFaceQuadratureRule(SVector{nfaces(shape)}(SmallQuadratureRule.(fqr.face_rules))) +SmallFaceQuadratureRule{refshape,T}(quad_type::Symbol, order::Int) where {refshape,T} = SmallFaceQuadratureRule(FaceQuadratureRule{refshape,T}(quad_type, order)) +SmallFaceQuadratureRule{refshape,T}(order::Int) where {refshape,T} = SmallFaceQuadratureRule(FaceQuadratureRule{refshape,T}(order)) + ################## # Common methods # ################## diff --git a/src/exports.jl b/src/exports.jl index dab7e368fb..27997ddb5d 100644 --- a/src/exports.jl +++ b/src/exports.jl @@ -21,7 +21,9 @@ export # Quadrature QuadratureRule, + SmallQuadratureRule, FaceQuadratureRule, + SmallFaceQuadratureRule, getnquadpoints, # FEValues From 4c563746fda29ead33a76ff515cb897578db90bd Mon Sep 17 00:00:00 2001 From: termi-official Date: Mon, 15 Jan 2024 21:15:51 +0100 Subject: [PATCH 15/16] manifest --- docs/Manifest.toml | 286 +++++++++++++++++++-------------------------- 1 file changed, 121 insertions(+), 165 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 9d91fc57f3..2a62920e10 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,13 +1,13 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.9.4" +julia_version = "1.10.0" manifest_format = "2.0" project_hash = "3221800eb396241cbc64f6deb94d7c003e4f881d" [[deps.ADTypes]] -git-tree-sha1 = "332e5d7baeff8497b923b730b994fa480601efc7" +git-tree-sha1 = "41c37aa88889c171f1300ceac1313c06e891d245" uuid = "47edcb42-4c32-4615-8424-f2b9edc5f35b" -version = "0.2.5" +version = "0.2.6" [[deps.ANSIColoredPrinters]] git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" @@ -33,25 +33,6 @@ git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" version = "0.4.4" -[[deps.Accessors]] -deps = ["CompositionsBase", "ConstructionBase", "Dates", 
"InverseFunctions", "LinearAlgebra", "MacroTools", "Test"] -git-tree-sha1 = "a7055b939deae2455aa8a67491e034f735dd08d3" -uuid = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" -version = "0.1.33" - - [deps.Accessors.extensions] - AccessorsAxisKeysExt = "AxisKeys" - AccessorsIntervalSetsExt = "IntervalSets" - AccessorsStaticArraysExt = "StaticArrays" - AccessorsStructArraysExt = "StructArrays" - - [deps.Accessors.weakdeps] - AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5" - IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" - Requires = "ae029012-a4dd-5104-9daa-d747884805df" - StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" - StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" - [[deps.Adapt]] deps = ["LinearAlgebra", "Requires"] git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608" @@ -74,9 +55,9 @@ version = "0.2.0" [[deps.ArrayInterface]] deps = ["Adapt", "LinearAlgebra", "Requires", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "247efbccf92448be332d154d6ca56b9fcdd93c31" +git-tree-sha1 = "bbec08a37f8722786d87bedf84eae19c020c4efa" uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "7.6.1" +version = "7.7.0" [deps.ArrayInterface.extensions] ArrayInterfaceBandedMatricesExt = "BandedMatrices" @@ -96,9 +77,9 @@ version = "7.6.1" [[deps.ArrayLayouts]] deps = ["FillArrays", "LinearAlgebra"] -git-tree-sha1 = "af43df5704827c8618afd36eb56fcab20d3041ee" +git-tree-sha1 = "b08a4043e1c14096ef8efe4dd97e07de5cacf240" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" -version = "1.4.3" +version = "1.4.5" weakdeps = ["SparseArrays"] [deps.ArrayLayouts.extensions] @@ -275,16 +256,7 @@ weakdeps = ["Dates", "LinearAlgebra"] [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.5+0" - -[[deps.CompositionsBase]] -git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad" -uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" -version = "0.1.2" -weakdeps = ["InverseFunctions"] - - [deps.CompositionsBase.extensions] - CompositionsBaseInverseFunctionsExt = "InverseFunctions" +version = "1.0.5+1" [[deps.ConcreteStructs]] git-tree-sha1 = "f749037478283d372048690eb3b5f92a79432b34" @@ -361,9 +333,9 @@ version = "1.9.1" [[deps.DiffEqBase]] deps = ["ArrayInterface", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces"] -git-tree-sha1 = "4aff905c723e3f883064f00a4ba553b29372ce57" +git-tree-sha1 = "f45adda33e3eb24ae1428dfa9b57ded8248ced99" uuid = "2b5f629d-d688-5b77-993f-72d75c75574e" -version = "6.142.0" +version = "6.145.4" [deps.DiffEqBase.extensions] DiffEqBaseChainRulesCoreExt = "ChainRulesCore" @@ -570,9 +542,9 @@ version = "1.9.3" [[deps.FiniteDiff]] deps = ["ArrayInterface", "LinearAlgebra", "Requires", "Setfield", "SparseArrays"] -git-tree-sha1 = "c6e4a1fbe73b31a3dea94b1da449503b8830c306" +git-tree-sha1 = "73d1214fec245096717847c62d389a5d2ac86504" uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" -version = "2.21.1" +version = "2.22.0" [deps.FiniteDiff.extensions] FiniteDiffBandedMatricesExt = "BandedMatrices" @@ -640,10 +612,10 @@ deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" [[deps.GLFW_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", 
"Libglvnd_jll", "Pkg", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] -git-tree-sha1 = "d972031d28c8c8d9d7b41a536ad7bb0c2579caca" +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] +git-tree-sha1 = "ff38ba61beff76b8f4acad8ab0c97ef73bb670cb" uuid = "0656b61e-2033-5cc2-a64a-77c0f6c09b89" -version = "3.3.8+0" +version = "3.3.9+0" [[deps.GLU_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg"] @@ -654,7 +626,7 @@ version = "9.0.1+0" [[deps.GMP_jll]] deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" -version = "6.2.1+2" +version = "6.2.1+6" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] @@ -785,27 +757,16 @@ git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" version = "1.4.0" -[[deps.IntegerMathUtils]] -git-tree-sha1 = "b8ffb903da9f7b8cf695a8bead8e01814aa24b30" -uuid = "18e54dd8-cb9d-406c-a71d-865a43cbb235" -version = "0.1.2" - [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "31d6adb719886d4e32e38197aae466e98881320b" +git-tree-sha1 = "5fdf2fe6724d8caabf43b557b84ce53f3b7e2f6b" uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" -version = "2024.0.0+0" +version = "2024.0.2+0" [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[deps.InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "68772f49f54b479fa88ace904f6127f0a3bb2e46" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.12" - [[deps.InvertedIndices]] git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" @@ -847,9 +808,15 @@ version = "0.21.4" [[deps.JSON3]] deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"] -git-tree-sha1 = "95220473901735a0f4df9d1ca5b171b568b2daa3" +git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b" uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -version = "1.13.2" +version = "1.14.0" + + [deps.JSON3.extensions] + JSON3ArrowExt = ["ArrowTypes"] + + [deps.JSON3.weakdeps] + ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" [[deps.JSONSchema]] deps = ["Downloads", "JSON", "JSON3", "URIs"] @@ -877,9 +844,9 @@ version = "0.4.1" [[deps.KernelAbstractions]] deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] -git-tree-sha1 = "81de11f7b02465435aab0ed7e935965bfcb3072b" +git-tree-sha1 = "653e0824fc9ab55b3beec67a6dbbe514a65fb954" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.9.14" +version = "0.9.15" weakdeps = ["EnzymeCore"] [deps.KernelAbstractions.extensions] @@ -925,10 +892,10 @@ uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586" version = "1.0.0" [[deps.LLVMOpenMP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "f689897ccbe049adb19a065c495e75f372ecd42b" +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "d986ce2d884d49126836ea94ed5bfb0f12679713" uuid = "1d63c593-3942-5779-bab2-d838dc0a180e" -version = "15.0.4+0" +version = "15.0.7+0" [[deps.LZO_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -955,12 +922,6 @@ version = "0.16.1" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" SymEngine = "123dc426-2d89-5057-bbad-38513e3affd8" 
-[[deps.LatticeRules]] -deps = ["Random"] -git-tree-sha1 = "7f5b02258a3ca0221a6a9710b0a0a2e8fb4957fe" -uuid = "73f95e8e-ec14-4e6a-8b18-0d2e271c4e55" -version = "0.0.1" - [[deps.LayoutPointers]] deps = ["ArrayInterface", "LinearAlgebra", "ManualMemory", "SIMDTypes", "Static", "StaticArrayInterface"] git-tree-sha1 = "62edfee3211981241b57ff1cedf4d74d79519277" @@ -980,9 +941,9 @@ version = "0.15.1" [[deps.LazyArrays]] deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "MacroTools", "MatrixFactorizations", "SparseArrays"] -git-tree-sha1 = "08d56555410b0683cd7b48bff37aa59cbb0ea908" +git-tree-sha1 = "9cfca23ab83b0dfac93cb1a1ef3331ab9fe596a5" uuid = "5078a376-72f3-5289-bfd5-ec5146d43c02" -version = "1.8.2" +version = "1.8.3" weakdeps = ["StaticArrays"] [deps.LazyArrays.extensions] @@ -1003,9 +964,14 @@ uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" version = "8.4.0+0" [[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[deps.LibGit2_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] +uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" +version = "1.6.4+0" + [[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" @@ -1085,10 +1051,10 @@ uuid = "18c40d15-f7cd-5a6d-bc92-87468d86c5db" version = "5.0.0+0" [[deps.LinearSolve]] -deps = ["ArrayInterface", "ConcreteStructs", "DocStringExtensions", "EnumX", "FastLapackInterface", "GPUArraysCore", "InteractiveUtils", "KLU", "Krylov", "Libdl", "LinearAlgebra", "MKL_jll", "PrecompileTools", "Preferences", "RecursiveFactorization", "Reexport", "Requires", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Sparspak", "StaticArraysCore", "UnPack"] -git-tree-sha1 = "1961a307fae7aef8e2b331902c6072f0e90e5abb" +deps = ["ArrayInterface", "ConcreteStructs", "DocStringExtensions", "EnumX", "FastLapackInterface", "GPUArraysCore", "InteractiveUtils", "KLU", "Krylov", "Libdl", "LinearAlgebra", "MKL_jll", "PrecompileTools", "Preferences", "RecursiveFactorization", "Reexport", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Sparspak", "StaticArraysCore", "UnPack"] +git-tree-sha1 = "97dc499678d50d989f1a74170840808641ce9880" uuid = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" -version = "2.20.2" +version = "2.22.0" [deps.LinearSolve.extensions] LinearSolveBandedMatricesExt = "BandedMatrices" @@ -1196,9 +1162,9 @@ version = "5.6.0+0" [[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +git-tree-sha1 = "b211c553c199c111d998ecdaf7623d1b89b69f93" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.11" +version = "0.5.12" [[deps.ManualMemory]] git-tree-sha1 = "bcaef4fc7a0cfe2cba636d84cda54b5e4e4ca3cd" @@ -1236,7 +1202,7 @@ version = "1.1.9" [[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" +version = "2.28.2+1" [[deps.Measures]] git-tree-sha1 = "c13304c81eec1ed3af7fc20e75fb6b26092a1102" @@ -1254,7 +1220,7 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2022.10.11" +version = "2023.1.10" [[deps.MuladdMacro]] git-tree-sha1 = "cac9cc5499c25554cba55cd3c30543cff5ca4fab" @@ -1267,12 +1233,6 @@ git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c" uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" version = "7.8.3" -[[deps.NLsolve]] -deps = 
["Distances", "LineSearches", "LinearAlgebra", "NLSolversBase", "Printf", "Reexport"] -git-tree-sha1 = "019f12e9a1a7880459d0173c182e6a99365d7ac1" -uuid = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" -version = "4.5.1" - [[deps.NVTX]] deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" @@ -1293,34 +1253,42 @@ version = "1.0.2" [[deps.NearestNeighbors]] deps = ["Distances", "StaticArrays"] -git-tree-sha1 = "3ef8ff4f011295fd938a521cb605099cecf084ca" +git-tree-sha1 = "ded64ff6d4fdd1cb68dfcbb818c69e144a5b2e4c" uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" -version = "0.4.15" +version = "0.4.16" [[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" version = "1.2.0" [[deps.NonlinearSolve]] -deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "EnumX", "FastBroadcast", "FiniteDiff", "ForwardDiff", "LazyArrays", "LineSearches", "LinearAlgebra", "LinearSolve", "MaybeInplace", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SimpleNonlinearSolve", "SparseArrays", "SparseDiffTools", "StaticArrays", "UnPack"] -git-tree-sha1 = "ec6577e2ebccf19797edcbf5951229598ad35650" +deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "EnumX", "FastBroadcast", "FastClosures", "FiniteDiff", "ForwardDiff", "LazyArrays", "LineSearches", "LinearAlgebra", "LinearSolve", "MaybeInplace", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SimpleNonlinearSolve", "SparseArrays", "SparseDiffTools", "StaticArrays", "UnPack"] +git-tree-sha1 = "72b036b728461272ae1b1c3f7096cb4c319d8793" uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" -version = "3.0.1" +version = "3.4.0" [deps.NonlinearSolve.extensions] NonlinearSolveBandedMatricesExt = "BandedMatrices" NonlinearSolveFastLevenbergMarquardtExt = "FastLevenbergMarquardt" + NonlinearSolveFixedPointAccelerationExt = "FixedPointAcceleration" NonlinearSolveLeastSquaresOptimExt = "LeastSquaresOptim" NonlinearSolveMINPACKExt = "MINPACK" NonlinearSolveNLsolveExt = "NLsolve" + NonlinearSolveSIAMFANLEquationsExt = "SIAMFANLEquations" + NonlinearSolveSpeedMappingExt = "SpeedMapping" + NonlinearSolveSymbolicsExt = "Symbolics" NonlinearSolveZygoteExt = "Zygote" [deps.NonlinearSolve.weakdeps] BandedMatrices = "aae01518-5342-5314-be14-df237901396f" FastLevenbergMarquardt = "7a0df574-e128-4d35-8cbd-3d84502bf7ce" + FixedPointAcceleration = "817d07cb-a79a-5c30-9a31-890123675176" LeastSquaresOptim = "0fc2ff8b-aaa3-5acd-a817-1944a5e08891" MINPACK = "4854310b-de5a-5eb6-a2a5-c1dee2bd17f9" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" + SIAMFANLEquations = "084e46ad-d928-497d-ad5e-07fa361a48c4" + SpeedMapping = "f1835b91-879b-4a3f-a438-e4baacf14412" + Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [[deps.OCCT_jll]] @@ -1330,10 +1298,13 @@ uuid = "baad4e97-8daa-5946-aac2-2edac59d34e1" version = "7.6.2+2" [[deps.OffsetArrays]] -deps = ["Adapt"] -git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb" +git-tree-sha1 = "6a731f2b5c03157418a20c12195eb4b74c8f8621" uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" -version = "1.12.10" +version = "1.13.0" +weakdeps = ["Adapt"] + + [deps.OffsetArrays.extensions] + OffsetArraysAdaptExt = "Adapt" [[deps.Ogg_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1344,12 +1315,12 @@ version = "1.3.5+1" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = 
"4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.21+4" +version = "0.3.23+2" [[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" -version = "0.8.1+0" +version = "0.8.1+2" [[deps.OpenSSL]] deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] @@ -1387,15 +1358,15 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.6.3" [[deps.OrdinaryDiffEq]] -deps = ["ADTypes", "Adapt", "ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "ExponentialUtilities", "FastBroadcast", "FastClosures", "FillArrays", "FiniteDiff", "ForwardDiff", "FunctionWrappersWrappers", "IfElse", "InteractiveUtils", "LineSearches", "LinearAlgebra", "LinearSolve", "Logging", "LoopVectorization", "MacroTools", "MuladdMacro", "NLsolve", "NonlinearSolve", "Polyester", "PreallocationTools", "PrecompileTools", "Preferences", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLNLSolve", "SciMLOperators", "SimpleNonlinearSolve", "SimpleUnPack", "SparseArrays", "SparseDiffTools", "StaticArrayInterface", "StaticArrays", "TruncatedStacktraces"] -git-tree-sha1 = "70edac326ad5bf91a51871f2437c6faa65956ef8" +deps = ["ADTypes", "Adapt", "ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "ExponentialUtilities", "FastBroadcast", "FastClosures", "FillArrays", "FiniteDiff", "ForwardDiff", "FunctionWrappersWrappers", "IfElse", "InteractiveUtils", "LineSearches", "LinearAlgebra", "LinearSolve", "Logging", "LoopVectorization", "MacroTools", "MuladdMacro", "NonlinearSolve", "Polyester", "PreallocationTools", "PrecompileTools", "Preferences", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SimpleNonlinearSolve", "SimpleUnPack", "SparseArrays", "SparseDiffTools", "StaticArrayInterface", "StaticArrays", "TruncatedStacktraces"] +git-tree-sha1 = "96ae028da53cdfe24712ab015a6f854cfd7609c0" uuid = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" -version = "6.61.0" +version = "6.66.0" [[deps.PCRE2_jll]] deps = ["Artifacts", "Libdl"] uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15" -version = "10.42.0+0" +version = "10.42.0+1" [[deps.PackageExtensionCompat]] git-tree-sha1 = "fb28e33b8a95c4cee25ce296c817d89cc2e53518" @@ -1411,9 +1382,9 @@ version = "0.12.3" [[deps.Parsers]] deps = ["Dates", "PrecompileTools", "UUIDs"] -git-tree-sha1 = "a935806434c9d4c506ba941871b327b96d41f2bf" +git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.8.0" +version = "2.8.1" [[deps.Pipe]] git-tree-sha1 = "6842804e7867b115ca9de748a0cf6b364523c16d" @@ -1429,7 +1400,7 @@ version = "0.42.2+0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.2" +version = "1.10.0" [[deps.PlotThemes]] deps = ["PlotUtils", "Statistics"] @@ -1439,9 +1410,9 @@ version = "3.1.0" [[deps.PlotUtils]] deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "f92e1315dadf8c46561fb9396e525f7200cdc227" +git-tree-sha1 = "862942baf5663da528f66d24996eb6da85218e76" uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.3.5" +version = "1.4.0" [[deps.Plots]] deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", 
"PlotThemes", "PlotUtils", "PrecompileTools", "Preferences", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"] @@ -1489,9 +1460,9 @@ version = "0.2.4" [[deps.PreallocationTools]] deps = ["Adapt", "ArrayInterface", "ForwardDiff", "Requires"] -git-tree-sha1 = "f739b1b3cc7b9949af3b35089931f2b58c289163" +git-tree-sha1 = "01ac95fca7daabe77a9cb705862bd87016af9ddb" uuid = "d236fae5-4411-538c-8e31-a6e3d9e00b46" -version = "0.4.12" +version = "0.4.13" [deps.PreallocationTools.extensions] PreallocationToolsReverseDiffExt = "ReverseDiff" @@ -1517,12 +1488,6 @@ git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660" uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" version = "2.3.1" -[[deps.Primes]] -deps = ["IntegerMathUtils"] -git-tree-sha1 = "1d05623b5952aed1307bf8b43bec8b8d1ef94b6e" -uuid = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" -version = "0.5.5" - [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -1539,31 +1504,19 @@ git-tree-sha1 = "7c29f0e8c575428bd84dc3c72ece5178caa67336" uuid = "c0090381-4147-56d7-9ebc-da0b1113ec56" version = "6.5.2+2" -[[deps.QuasiMonteCarlo]] -deps = ["Accessors", "ConcreteStructs", "LatticeRules", "LinearAlgebra", "Primes", "Random", "Requires", "Sobol", "StatsBase"] -git-tree-sha1 = "cc086f8485bce77b6187141e1413c3b55f9a4341" -uuid = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" -version = "0.3.3" - - [deps.QuasiMonteCarlo.extensions] - QuasiMonteCarloDistributionsExt = "Distributions" - - [deps.QuasiMonteCarlo.weakdeps] - Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" - [[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[deps.Random]] -deps = ["SHA", "Serialization"] +deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[deps.Random123]] deps = ["Random", "RandomNumbers"] -git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" +git-tree-sha1 = "c860e84651f58ce240dd79e5d9e055d55234c35a" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.6.1" +version = "1.6.2" [[deps.RandomNumbers]] deps = ["Random", "Requires"] @@ -1584,18 +1537,20 @@ uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c" version = "0.6.12" [[deps.RecursiveArrayTools]] -deps = ["Adapt", "ArrayInterface", "DocStringExtensions", "GPUArraysCore", "IteratorInterfaceExtensions", "LinearAlgebra", "RecipesBase", "Requires", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables"] -git-tree-sha1 = "d7087c013e8a496ff396bae843b1e16d9a30ede8" +deps = ["Adapt", "ArrayInterface", "DocStringExtensions", "GPUArraysCore", "IteratorInterfaceExtensions", "LinearAlgebra", "RecipesBase", "Requires", "SparseArrays", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables"] +git-tree-sha1 = "27ee1c03e732c488ecce1a25f0d7da9b5d936574" uuid = "731186ca-8d62-57ce-b412-fbd966d074cd" -version = "2.38.10" +version = "3.3.3" [deps.RecursiveArrayTools.extensions] + RecursiveArrayToolsFastBroadcastExt = "FastBroadcast" RecursiveArrayToolsMeasurementsExt = "Measurements" RecursiveArrayToolsMonteCarloMeasurementsExt = "MonteCarloMeasurements" RecursiveArrayToolsTrackerExt = "Tracker" RecursiveArrayToolsZygoteExt = "Zygote" [deps.RecursiveArrayTools.weakdeps] + FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898" Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7" MonteCarloMeasurements = 
"0987c9cc-fe09-11e8-30f0-b96dd679fdca" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" @@ -1664,10 +1619,10 @@ uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa" version = "0.6.42" [[deps.SciMLBase]] -deps = ["ADTypes", "ArrayInterface", "CommonSolve", "ConstructionBase", "Distributed", "DocStringExtensions", "EnumX", "FillArrays", "FunctionWrappersWrappers", "IteratorInterfaceExtensions", "LinearAlgebra", "Logging", "Markdown", "PrecompileTools", "Preferences", "Printf", "QuasiMonteCarlo", "RecipesBase", "RecursiveArrayTools", "Reexport", "RuntimeGeneratedFunctions", "SciMLOperators", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables", "TruncatedStacktraces"] -git-tree-sha1 = "32ea825941f7b58a6f48268f4b76971ae8eb9eec" +deps = ["ADTypes", "ArrayInterface", "CommonSolve", "ConstructionBase", "Distributed", "DocStringExtensions", "EnumX", "FillArrays", "FunctionWrappersWrappers", "IteratorInterfaceExtensions", "LinearAlgebra", "Logging", "Markdown", "PrecompileTools", "Preferences", "Printf", "RecipesBase", "RecursiveArrayTools", "Reexport", "RuntimeGeneratedFunctions", "SciMLOperators", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables", "TruncatedStacktraces"] +git-tree-sha1 = "2c3706caa9adab5031f30937699c46e159ae477f" uuid = "0bca4576-84f4-4d90-8ffe-ffa030f20462" -version = "2.10.0" +version = "2.13.0" [deps.SciMLBase.extensions] SciMLBaseChainRulesCoreExt = "ChainRulesCore" @@ -1686,12 +1641,6 @@ version = "2.10.0" RCall = "6f49c342-dc21-5d91-9882-a32aef131414" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -[[deps.SciMLNLSolve]] -deps = ["DiffEqBase", "LineSearches", "NLsolve", "Reexport", "SciMLBase"] -git-tree-sha1 = "765b788339abd7d983618c09cfc0192e2b6b15fd" -uuid = "e9a6253c-8580-4d32-9898-8661bb511710" -version = "0.1.9" - [[deps.SciMLOperators]] deps = ["ArrayInterface", "DocStringExtensions", "Lazy", "LinearAlgebra", "Setfield", "SparseArrays", "StaticArraysCore", "Tricks"] git-tree-sha1 = "51ae235ff058a64815e0a2c34b1db7578a06813d" @@ -1736,9 +1685,15 @@ version = "1.1.0" [[deps.SimpleNonlinearSolve]] deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "FiniteDiff", "ForwardDiff", "LinearAlgebra", "MaybeInplace", "PrecompileTools", "Reexport", "SciMLBase", "StaticArraysCore"] -git-tree-sha1 = "62b08ae70b9fe7e4dacffcf9cbb0e5c2d6688d09" +git-tree-sha1 = "8d672bd91dc432fb286b6d4bcf1a5dc417e932a3" uuid = "727e6d20-b764-4bd8-a329-72de5adea6c7" -version = "1.0.2" +version = "1.2.0" + + [deps.SimpleNonlinearSolve.extensions] + SimpleNonlinearSolvePolyesterForwardDiffExt = "PolyesterForwardDiff" + + [deps.SimpleNonlinearSolve.weakdeps] + PolyesterForwardDiff = "98d1487c-24ca-40b6-b7ab-df2af84e126b" [[deps.SimpleTraits]] deps = ["InteractiveUtils", "MacroTools"] @@ -1751,30 +1706,25 @@ git-tree-sha1 = "58e6353e72cde29b90a69527e56df1b5c3d8c437" uuid = "ce78b400-467f-4804-87d8-8f486da07d0a" version = "1.1.0" -[[deps.Sobol]] -deps = ["DelimitedFiles", "Random"] -git-tree-sha1 = "5a74ac22a9daef23705f010f72c81d6925b19df8" -uuid = "ed01d8cd-4d21-5b2a-85b4-cc3bdc58bad4" -version = "1.5.0" - [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" [[deps.SortingAlgorithms]] deps = ["DataStructures"] -git-tree-sha1 = "5165dfb9fd131cf0c6957a3a7605dede376e7b63" +git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085" uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.2.0" +version = "1.2.1" [[deps.SparseArrays]] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = 
"2f01184e-e22b-5df5-ae63-d93ebab69eaf" +version = "1.10.0" [[deps.SparseDiffTools]] deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", "Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "Tricks", "UnPack", "VertexSafeGraphs"] -git-tree-sha1 = "ddea63e5de5405878d990a2cea4fc1daf2d52d41" +git-tree-sha1 = "c281e11db4eacb36a292a054bac83c5a0aca2a26" uuid = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -version = "2.14.0" +version = "2.15.0" [deps.SparseDiffTools.extensions] SparseDiffToolsEnzymeExt = "Enzyme" @@ -1823,14 +1773,18 @@ weakdeps = ["OffsetArrays", "StaticArrays"] [[deps.StaticArrays]] deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"] -git-tree-sha1 = "5ef59aea6f18c25168842bded46b16662141ab87" +git-tree-sha1 = "fba11dbe2562eecdfcac49a05246af09ee64d055" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.7.0" -weakdeps = ["Statistics"] +version = "1.8.1" [deps.StaticArrays.extensions] + StaticArraysChainRulesCoreExt = "ChainRulesCore" StaticArraysStatisticsExt = "Statistics" + [deps.StaticArrays.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + [[deps.StaticArraysCore]] git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" @@ -1839,7 +1793,7 @@ version = "1.4.2" [[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -version = "1.9.0" +version = "1.10.0" [[deps.StatsAPI]] deps = ["LinearAlgebra"] @@ -1882,15 +1836,14 @@ deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" [[deps.SuiteSparse_jll]] -deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "5.10.1+6" +version = "7.2.1+1" [[deps.SymbolicIndexingInterface]] -deps = ["DocStringExtensions"] -git-tree-sha1 = "f8ab052bfcbdb9b48fad2c80c873aa0d0344dfe5" +git-tree-sha1 = "be414bfd80c2c91197823890c66ef4b74f5bf5fe" uuid = "2efcf032-c050-4f8e-a9bb-153293bab1f5" -version = "0.2.2" +version = "0.3.1" [[deps.TOML]] deps = ["Dates"] @@ -1993,12 +1946,15 @@ deps = ["Dates", "LinearAlgebra", "Random"] git-tree-sha1 = "3c793be6df9dd77a0cf49d80984ef9ff996948fa" uuid = "1986cc42-f94f-5a68-af5c-568840ba703d" version = "1.19.0" -weakdeps = ["ConstructionBase", "InverseFunctions"] [deps.Unitful.extensions] ConstructionBaseUnitfulExt = "ConstructionBase" InverseFunctionsUnitfulExt = "InverseFunctions" + [deps.Unitful.weakdeps] + ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + [[deps.UnitfulLatexify]] deps = ["LaTeXStrings", "Latexify", "Unitful"] git-tree-sha1 = "e2d817cc500e960fdbafcf988ac8436ba3208bfd" @@ -2239,7 +2195,7 @@ version = "0.4.9" [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" +version = "1.2.13+1" [[deps.Zstd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -2286,7 +2242,7 @@ version = "0.15.1+0" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+0" +version = "5.8.0+1" [[deps.libevdev_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -2332,7 +2288,7 @@ version = "1.52.0+1" 
[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" +version = "17.4.0+2" [[deps.x264_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] From 2b7bb240a77f00ed3da4a6012ed7c60a0c710d6c Mon Sep 17 00:00:00 2001 From: termi-official Date: Tue, 27 Feb 2024 15:48:11 +0100 Subject: [PATCH 16/16] Tune runtimes --- docs/src/literate-tutorials/gpu_assembly.jl | 127 ++++++++++++-------- 1 file changed, 76 insertions(+), 51 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 5277eb5e7a..362b498a20 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -67,7 +67,7 @@ function GPUGrid(::Type{T}, grid::Grid) where T ) end -struct GPUDofHandler{GRID <: GPUGrid, CADOFS <: AbstractArray{Int,1}, CAOFFSETS <: AbstractArray{Int,1}} +struct GPUDofHandler{GRID <: GPUGrid, CADOFS <: AbstractArray{Int,1}, CAOFFSETS <: AbstractArray{Int,1}} <: Ferrite.AbstractDofHandler cell_dofs::CADOFS cell_dofs_offset::CAOFFSETS grid::GRID @@ -93,6 +93,11 @@ function Ferrite.celldofs(dh::GPUDofHandler, cell_index::Int) end Adapt.@adapt_structure GPUDofHandler +function Ferrite.celldofs(dh::DofHandler, cell_index::Int) + cell_dof_range = dh.cell_dofs_offset[cell_index]:(dh.cell_dofs_offset[cell_index+1]-1) + return @view dh.cell_dofs[cell_dof_range] +end + function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] @@ -302,7 +307,7 @@ function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUD synchronize() # Apply action one time - numthreads = 128 + numthreads = 8 for color ∈ colors numblocks = ceil(Int, length(color)/numthreads) # try @@ -341,24 +346,22 @@ Base.size(A::FerriteMFMassOperator, d) = ndofs(dh) # Square operator struct FerriteEAMassOperator{D2E, XE2, EAMATS <: AbstractArray{<:Any,3}} d2e::D2E + e2d::D2E + xe::XE2 xe2::XE2 element_matrices::EAMATS end function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::CuDeviceArray{T,2}) where T - # base_cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x - # stride = blockDim().x - # for cell_index ∈ base_cell_index:min(base_cell_index+stride,size(emats,3)) - cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while cell_index ≤ size(emats,3) xei = @view xe[:,cell_index] xe2i = @view xe2[:,cell_index] emati = @view emats[:,:,cell_index] - # mul!(xe2i, emati, xei) # fails? xe2i .= T(0.0) - for j ∈ 1:size(emats,2), i ∈ 1:size(emats,1) + # mul!(xe2i, emati, xei) # fails? + for j ∈ 1:size(emati,2), i ∈ 1:size(emati,1) xe2i[i] += emati[i,j]*xei[j] end cell_index += stride @@ -366,17 +369,17 @@ function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::C end function LinearAlgebra.mul!(y,A::FerriteEAMassOperator{<:Any,<:Any,<:CuArray},x) - xe = A.d2e*x + mul!(A.xe, A.d2e, x) synchronize() - xev = reshape(xe, (size(A.element_matrices,2), size(A.element_matrices,3))) + xev = reshape(A.xe, (size(A.element_matrices,2), size(A.element_matrices,3))) xe2v = reshape(A.xe2, (size(A.element_matrices,2), size(A.element_matrices,3))) - synchronize() + # synchronize() # CUDA.CUBLAS.gemv_batched!(???) 
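    # A possible answer to the `gemv_batched!(???)` note above, sketched here as a
    # commented alternative: treat each element vector as an n×1 matrix and let cuBLAS
    # perform the batched product via `gemm_strided_batched!`. This assumes the element
    # matrices are stored as an (n, n, ncells) `CuArray` with a BLAS-supported element
    # type (Float32 in this example):
    # CUDA.CUBLAS.gemm_strided_batched!(
    #     'N', 'N', one(eltype(A.xe)),
    #     A.element_matrices,
    #     reshape(xev,  size(A.element_matrices, 2), 1, size(A.element_matrices, 3)),
    #     zero(eltype(A.xe)),
    #     reshape(xe2v, size(A.element_matrices, 1), 1, size(A.element_matrices, 3)),
    # )
    # With such a call the hand-written `gemv_batched!` launch below would not be needed.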
- numthreads = 128 + numthreads = 8 numblocks = ceil(Int, size(A.element_matrices,3)/numthreads) @cuda threads=numthreads blocks=numblocks gemv_batched!(xe2v, A.element_matrices, xev) synchronize() - mul!(y, transpose(A.d2e), A.xe2) + mul!(y, A.e2d, A.xe2) synchronize() end Base.eltype(A::FerriteEAMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T @@ -392,47 +395,66 @@ struct FerriteRHS{DH, COLS, IP, GIP, QR} qr::QR end +function rhs_kernel_inner!(rhs::AbstractVector{T}, dh::Union{DofHandler{<:Any,<:Grid{sdim,<:Any,T}}, GPUDofHandler{<:GPUGrid{sdim,<:Any,T}}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_index::Int) where {sdim,T} + ## Grab the actual cell + cell = dh.grid.cells[cell_index] + # ## Grab the dofs on the cell + cell_dofs = celldofs(dh, cell_index) + ## Grab the buffers for the y and x + rhsₑ = @view rhs[cell_dofs] + ## Grab coordinate array + coords = cellnodesvec(cell, dh.grid.nodes) + ## Apply local action for y = Ax + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, coords, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + + # # TODO spatial_coordinate + x = zero(Vec{sdim,T}) + ϕᵍ = shape_values(ip_geo, ξ) + for i in 1:getnbasefunctions(ip_geo) + x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] + end + + ϕᶠ = shape_values(ip, ξ) + @inbounds for i in 1:getnbasefunctions(ip) + rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ + end + end +end + function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{sdim,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {sdim,T} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while index <= length(cell_indices) ## Get index of the current cell cell_index = cell_indices[index] - ## Grab the actual cell - cell = gpudh.grid.cells[cell_index] - # ## Grab the dofs on the cell - cell_dofs = celldofs(gpudh, cell_index) - ## Grab the buffers for the y and x - rhsₑ = @view rhs[cell_dofs] - ## Grab coordinate array - coords = cellnodesvec(cell, gpudh.grid.nodes) - ## Apply local action for y = Ax - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) - ξ = qr.points[q_point] - # TODO recover abstraction layer - J = getJ(ip_geo, coords, ξ) - dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) - - # # TODO spatial_coordinate - x = zero(Vec{sdim,T}) - ϕᵍ = shape_values(ip_geo, ξ) - for i in 1:getnbasefunctions(ip_geo) - x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] - end - - ϕᶠ = shape_values(ip, ξ) - @inbounds for i in 1:getnbasefunctions(ip) - rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ - end - end + ## Launch assembly kernel + rhs_kernel_inner!(rhs, gpudh, qr, ip, ip_geo, cell_index) + ## Shift index up index += stride end end +function generate_rhs(cpurhs::FerriteRHS{<:DofHandler{<:Any, <:Grid{<:Any,<:Any,T}}}) where T + rhs = zeros(T,ndofs(dh)) + for color ∈ cpurhs.colors + # try + for cell_index ∈ color + # rhs_kernel_inner!(rhs, cpurhs.dh, cpurhs.qr, cpurhs.ip, cpurhs.ip_geo, cell_index) + # return @benchmark rhs_kernel_inner!($rhs, $(cpurhs.dh), $(cpurhs.qr), $cpurhs.ip, $cpurhs.ip_geo, $cell_index) + end + # catch err + # code_typed(err; interactive = true) + # end + end + return rhs +end -function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}) 
where T +function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}, numthreads) where T rhs = CuArray(zeros(T,ndofs(dh))) - numthreads = 128 for color ∈ gpurhs.colors numblocks = ceil(Int, length(color)/numthreads) # try @@ -464,24 +486,27 @@ function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T return rhs end +cpurhsop = FerriteRHS(dh, colors, ip, ip, SmallQuadratureRule(qr)) +bcpu = generate_rhs(cpurhsop) +@benchmark generate_rhs($cpurhsop) + cudh = GPUDofHandler(dh) cucolors = CuArray.(colors) Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) -rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) -bgpu = generate_rhs(rhsop) +gpurhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(gpurhsop, 16) +@benchmark generate_rhs($gpurhsop, 16) ugpu = CUDA.fill(0.f0, ndofs(dh)); # cg!(ugpu, Agpu, bgpu; verbose=true); -# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) -Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSC(d2e), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) +Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSR(d2e), CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(d2e)), CuArray(zeros(Float32, size(d2e,1))), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) # @benchmark CUDA.@sync mul!($ugpu,$Aeagpu,$bgpu) -ucpu = copy(Array(ugpu)) -bcpu = copy(Array(bgpu)) -# @benchmark mul!($u,$Kcpu,$(Array(bgpu))) +ucpu = fill(0.f0, ndofs(dh)); +# @benchmark mul!($ucpu,$Kcpu,$(Array(bcpu))) -Kgpu = CUDA.CUSPARSE.CuSparseMatrixCSC(Kcpu) -# @benchmark mul!($ugpu,$Kcpu,$bgpu) +Kgpu = CUDA.CUSPARSE.CuSparseMatrixCSR(Kcpu) +# @benchmark CUDA.@sync mul!($ugpu,$Kgpu,$bgpu) # ### Exporting to VTK # To visualize the result we export the grid and our field `u`
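# Before the export it can be reassuring to cross-check the operator variants set up
# above against each other. The following is a minimal sketch; it assumes that `Agpu`,
# `Aeagpu` and `Kgpu` all realize the matrix assembled into `Kcpu`, and that `mul!` is
# defined for each of them as done earlier in this example:
y_mf = CUDA.fill(0.f0, ndofs(dh)); mul!(y_mf, Agpu, bgpu)
y_ea = CUDA.fill(0.f0, ndofs(dh)); mul!(y_ea, Aeagpu, bgpu)
y_sp = CUDA.fill(0.f0, ndofs(dh)); mul!(y_sp, Kgpu, bgpu)
@show norm(y_mf - y_sp) norm(y_ea - y_sp)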