From 6803abf54669bb89a56e4c92b2e4951be3146f02 Mon Sep 17 00:00:00 2001
From: termi-official
Date: Fri, 14 Jul 2023 19:25:04 +0200
Subject: [PATCH 01/16] MatrixFree GPU heat equation prototype.

---
 docs/src/literate-tutorials/gpu_assembly.jl | 323 ++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 docs/src/literate-tutorials/gpu_assembly.jl

diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl
new file mode 100644
index 0000000000..c2adf4c57d
--- /dev/null
+++ b/docs/src/literate-tutorials/gpu_assembly.jl
@@ -0,0 +1,323 @@
+# # [Heat equation](@id tutorial-heat-equation)
+#
+# ![](heat_square.png)
+#
+# *Figure 1*: Temperature field on the unit square with an internal uniform heat source
+# solved with homogeneous Dirichlet boundary conditions on the boundary.
+#
+#-
+#md # !!! tip
+#md #     This example is also available as a Jupyter notebook:
+#md #     [`heat_equation.ipynb`](@__NBVIEWER_ROOT_URL__/examples/heat_equation.ipynb).
+#-
+#
+# ## Introduction
+#
+# The heat equation is the "Hello, world!" equation of finite elements.
+# Here we solve the equation on a unit square, with a uniform internal source.
+# The strong form of the (linear) heat equation is given by
+#
+# ```math
+# -\nabla \cdot (k \nabla u) = f \quad \textbf{x} \in \Omega,
+# ```
+#
+# where $u$ is the unknown temperature field, $k$ the heat conductivity,
+# $f$ the heat source and $\Omega$ the domain. For simplicity we set $f = 1$
+# and $k = 1$. We will consider homogeneous Dirichlet boundary conditions such that
+# ```math
+# u(\textbf{x}) = 0 \quad \textbf{x} \in \partial \Omega,
+# ```
+# where $\partial \Omega$ denotes the boundary of $\Omega$.
+# The resulting weak form is given as follows: Find ``u \in \mathbb{U}`` such that
+# ```math
+# \int_{\Omega} \nabla \delta u \cdot \nabla u \ d\Omega = \int_{\Omega} \delta u \ d\Omega \quad \forall \delta u \in \mathbb{T},
+# ```
+# where $\delta u$ is a test function, and where $\mathbb{U}$ and $\mathbb{T}$ are suitable
+# trial and test function sets, respectively.
+#-
+# ## Commented Program
+#
+# Now we solve the problem in Ferrite. What follows is a program spliced with comments.
+#md # The full program, without comments, can be found in the next [section](@ref heat_equation-plain-program).
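+#
+# In contrast to the CPU heat equation tutorial, this prototype never assembles a global
+# sparse matrix. The linear operator is instead applied "matrix-free": for every cell we
+# gather the local part of the input vector, apply the element operator, and scatter-add
+# the result into the output vector. A minimal serial sketch of this pattern is shown
+# below; `element_action!` is a placeholder for the element kernel developed in the rest
+# of the tutorial, the remaining calls are regular Ferrite API also used further down.
+#
+# ```julia
+# function matrix_free_action!(uout, uin, dh, element_action!)
+#     fill!(uout, 0)
+#     uₑ = zeros(ndofs_per_cell(dh, 1))
+#     for cell in CellIterator(dh)
+#         dofs = celldofs(cell)            # global dof indices of this cell
+#         fill!(uₑ, 0)
+#         element_action!(uₑ, uin[dofs])   # local action uₑ = Aₑ * uin[dofs]
+#         uout[dofs] .+= uₑ                # scatter-add into the global output
+#     end
+#     return uout
+# end
+# ```
+#
+# On the GPU many cells are processed concurrently, so the scatter-add can race between
+# cells that share degrees of freedom. This is why the grid is colored below: cells of the
+# same color never share dofs and can therefore be processed in parallel safely.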
+# +# First we load Ferrite, and some other packages we need +using Ferrite, CUDA +using IterativeSolvers, LinearAlgebra + +### TODO Extension +import Adapt +using StaticArrays +struct GPUQuadratureRule{N,T,dim} + weights::SVector{N,T} + points::SVector{N,Vec{dim,T}} +end +function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} + N = length(qr.weights) + GPUQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) +end +function Adapt.adapt_structure(to, nodes::Vector{Node}) + CuArray(get_node_coordinate.(nodes)) +end +### TODO Adapt dofhandler + +# TODO not sure how to do this automatically +function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) + ξ_x = ξ[1] + ξ_y = ξ[2] + i == 1 && return (1 - ξ_x) * (1 - ξ_y) * 0.25 + i == 2 && return (1 + ξ_x) * (1 - ξ_y) * 0.25 + i == 3 && return (1 + ξ_x) * (1 + ξ_y) * 0.25 + i == 4 && return (1 - ξ_x) * (1 + ξ_y) * 0.25 +end + +function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) + ξ_x = ξ[1] + ξ_y = ξ[2] + return @SVector [ + (1 - ξ_x) * (1 - ξ_y) * 0.25, + (1 + ξ_x) * (1 - ξ_y) * 0.25, + (1 + ξ_x) * (1 + ξ_y) * 0.25, + (1 - ξ_x) * (1 + ξ_y) * 0.25, + ] +end + +function shape_gradients(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) + ξ_x = ξ[1] + ξ_y = ξ[2] + return @SMatrix [ + (0 - 1) * (1 - ξ_y) * 0.25 (1 - ξ_x) * (0 - 1) * 0.25; + (0 + 1) * (1 - ξ_y) * 0.25 (1 + ξ_x) * (0 - 1) * 0.25; + (0 + 1) * (1 + ξ_y) * 0.25 (1 + ξ_x) * (0 + 1) * 0.25; + (0 - 1) * (1 + ξ_y) * 0.25 (1 - ξ_x) * (0 + 1) * 0.25; + ] +end + +function unsafe_shape_gradient(ip::Interpolation, ξ::Vec, i::Int) + return Tensors.gradient(x -> unsafe_shape_value(ip, x, i), ξ) +end + +cellnodes(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]]) + +# We start by generating a simple grid with 20x20 quadrilateral elements +# using `generate_grid`. The generator defaults to the unit square, +# so we don't need to specify the corners of the domain. +grid = generate_grid(Quadrilateral, (200, 200)); +colors = CuArray.(create_coloring(grid)); + +# ### Trial and test functions +# A `CellValues` facilitates the process of evaluating values and gradients of +# test and trial functions (among other things). To define +# this we need to specify an interpolation space for the shape functions. +# We use Lagrange functions +# based on the two-dimensional reference quadrilateral. We also define a quadrature rule based on +# the same reference element. We combine the interpolation and the quadrature rule +# to a `CellValues` object. +ip = Lagrange{RefQuadrilateral, 1}() +qr = QuadratureRule{RefQuadrilateral}(2) +cellvalues = CellValues(qr, ip); + +# ### Degrees of freedom +# Next we need to define a `DofHandler`, which will take care of numbering +# and distribution of degrees of freedom for our approximated fields. +# We create the `DofHandler` and then add a single scalar field called `:u` based on +# our interpolation `ip` defined above. +# Lastly we `close!` the `DofHandler`, it is now that the dofs are distributed +# for all the elements. +dh = DofHandler(grid) +add!(dh, :u, ip) +close!(dh); + +# ### Boundary conditions +# In Ferrite constraints like Dirichlet boundary conditions +# are handled by a `ConstraintHandler`. +# ch = ConstraintHandler(dh); + +# Next we need to add constraints to `ch`. For this problem we define +# homogeneous Dirichlet boundary conditions on the whole boundary, i.e. 
+# the `union` of all the face sets on the boundary. +# ∂Ω = union( +# getfaceset(grid, "left"), +# getfaceset(grid, "right"), +# getfaceset(grid, "top"), +# getfaceset(grid, "bottom"), +# ); + +# Now we are set up to define our constraint. We specify which field +# the condition is for, and our combined face set `∂Ω`. The last +# argument is a function of the form $f(\textbf{x})$ or $f(\textbf{x}, t)$, +# where $\textbf{x}$ is the spatial coordinate and +# $t$ the current time, and returns the prescribed value. Since the boundary condition in +# this case do not depend on time we define our function as $f(\textbf{x}) = 0$, i.e. +# no matter what $\textbf{x}$ we return $0$. When we have +# specified our constraint we `add!` it to `ch`. +# dbc = Dirichlet(:u, ∂Ω, (x, t) -> 0) +# add!(ch, dbc); +# close!(ch) + +function gpu_element_mass_action!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip, ip_geo, xₑ) where {V,N,T,dim} + n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, xₑ, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + for i in 1:n_basefuncs + ϕᵢ = unsafe_shape_value(ip, ξ, i) + for j in 1:n_basefuncs + ϕⱼ = unsafe_shape_value(ip, ξ, j) + uₑout[i] += ϕᵢ*ϕⱼ*uₑin[j]*dΩ + end + end + end + return nothing +end + +function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} + n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + shapes = shape_values(ip, ξ) + # TODO recover abstraction layer + J = getJ(ip_geo, xₑ, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + for i in 1:n_basefuncs + ϕᵢ = shapes[i] + for j in 1:n_basefuncs + inner = shapes[j]*uₑin[j] + uₑout[i] += inner*ϕᵢ*dΩ + end + end + end + return nothing +end + +# Mass action on a single color +Base.@propagate_inbounds function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} + index = threadIdx().x # this example only requires linear indexing, so just use `x` + stride = blockDim().x + for i = index:stride:length(cell_indices) + ## Get index of the current cell + cell_index = cell_indices[i] + ## Grab the actual cell + cell = cells[cell_index] + ## Grab the dofs on the cell + cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) + cell_dofs = @view all_cell_dofs[cell_dof_range] + ## Grab the buffers for the y and x + uₑin = @view uin[cell_dofs] + uₑout = @view uout[cell_dofs] + ## Grab coordinate array + xₑ = cellnodes(cell, nodes) + ## Apply local action for y = Ax + gpu_element_mass_action2!(uₑout, uₑin, qr, ip, ip_geo, xₑ) + end + return nothing +end + +# Mass action of the full operator +function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} + # Initialize solution + fill!(uout, 0.0) + synchronize() + + # Apply action one time + dofs_per_cell = ndofs_per_cell(dh, 1) + for color ∈ colors + numthreads = 256 + numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) + # try + @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + synchronize() + # catch err + # code_typed(err; interactive = true) + # end + end +end + +function getJ(ip_geo::GIP, xₑ::NTuple{N,Vec{dim,T}}, ξ::Vec{dim,T}) where {N,GIP,dim,T} + dMdξ = shape_gradients(ip_geo, ξ) + fecv_J = zero(Tensor{2,dim,T}) + for i in 1:length(xₑ) + fecv_J += xₑ[i] ⊗ Vec{dim,T}(dMdξ[i, :]) + end + return fecv_J +end + +# +# Define the operator struct (https://iterativesolvers.julialinearalgebra.org/dev/getting_started/) +struct FerriteGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} + # "GPUGrid" + gpu_cells::CACELLS + gpu_nodes::CANODES + colors::CACOLORS + # "GPUDofHandler" + all_cell_dofs::CADOFS + cell_dof_offsets::CAOFFSETS + # "GPUValues" + ip::IP + ip_geo::GIP + qr::QR +end + +function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) + FerriteGPUMassOperator( + CuArray(getcells(Ferrite.get_grid(dh))), + CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + colors, + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + ip, ip_geo, qr + ) +end + +LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) +Base.eltype(A::FerriteGPUMassOperator) = Float64 +Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator + +A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) + +function generate_rhs(dh) + rhs = zeros(ndofs(dh)) + for cell in CellIterator(dh) + reinit!(cellvalues, cell) + coords = get_cell_coordinates(cell) + n_basefuncs = getnbasefunctions(cellvalues) + fe = zeros(n_basefuncs) + for q_point in 1:getnquadpoints(cellvalues) + ## Get the quadrature weight + dΩ = getdetJdV(cellvalues, q_point) + x = spatial_coordinate(cellvalues, q_point, coords) + ## Loop over test shape functions + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + ## Add contribution to fe + fe[i] += cos(x[1]/π)*cos(x[2]/π)*δu * dΩ + end + end + rhs[celldofs(cell)] .+= fe + end + return CuArray(rhs) +end +b = generate_rhs(dh) +u = CUDA.fill(0.0, ndofs(dh)); +cg!(u, A, b; verbose=true); + +# ### Exporting to VTK +# To visualize the result we export the grid and our field `u` +# to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). +vtk_grid("heat_equation", dh) do vtk + vtk_point_data(vtk, dh, Array(u)) +end + +## test the result #src +# using Test #src +# @test norm(u) ≈ 3.307743912641305 #src + +#md # ## [Plain program](@id heat_equation-plain-program) +#md # +#md # Here follows a version of the program without any comments. +#md # The file is also available here: [`heat_equation.jl`](heat_equation.jl). +#md # +#md # ```julia +#md # @__CODE__ +#md # ``` From 08256af8bbd8ff99b272262b9c5f808eabe23874 Mon Sep 17 00:00:00 2001 From: termi-official Date: Wed, 11 Oct 2023 18:41:27 +0200 Subject: [PATCH 02/16] Temporary solution for sparsity pattern on GPU. 
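
The new `matrix_type` keyword (default `SparseMatrixCSC{Float64}`) makes the value type of
the created pattern configurable, e.g. for single-precision or complex operators. A usage
sketch:

    K = create_sparsity_pattern(dh; matrix_type = SparseMatrixCSC{Float32})
    eltype(K) == Float32   # true
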
--- src/Dofs/sparsity_pattern.jl | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/Dofs/sparsity_pattern.jl b/src/Dofs/sparsity_pattern.jl index 46ef67c3d3..8171ea3a54 100644 --- a/src/Dofs/sparsity_pattern.jl +++ b/src/Dofs/sparsity_pattern.jl @@ -1,5 +1,5 @@ """ - create_sparsity_pattern(dh::DofHandler; coupling, [topology::Union{Nothing, AbstractTopology}], [cross_coupling]) + create_sparsity_pattern(dh::DofHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling, matrix_type) Create the sparsity pattern corresponding to the degree of freedom numbering in the [`DofHandler`](@ref). Return a `SparseMatrixCSC` @@ -13,15 +13,17 @@ not. By default full coupling is assumed inside the element with no coupling bet If `topology` and `cross_coupling` are passed, dof of fields with discontinuous interpolations are coupled between elements according to `cross_coupling`. +With + See the [Sparsity Pattern](@ref) section of the manual. """ function create_sparsity_pattern(dh::AbstractDofHandler; coupling=nothing, - topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing) - return _create_sparsity_pattern(dh, nothing, false, true, coupling, topology, cross_coupling) + topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return _create_sparsity_pattern(matrix_type, dh, nothing, false, true, coupling, topology, cross_coupling) end """ - create_symmetric_sparsity_pattern(dh::DofHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling) + create_symmetric_sparsity_pattern(dh::DofHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling, matrix_type) Create the symmetric sparsity pattern corresponding to the degree of freedom numbering in the [`DofHandler`](@ref) by only considering the upper @@ -30,32 +32,32 @@ triangle of the matrix. Return a `Symmetric{SparseMatrixCSC}`. See the [Sparsity Pattern](@ref) section of the manual. """ function create_symmetric_sparsity_pattern(dh::AbstractDofHandler; coupling=nothing, - topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing) - return Symmetric(_create_sparsity_pattern(dh, nothing, true, true, coupling, topology, cross_coupling), :U) + topology::Union{Nothing, AbstractTopology} = nothing, cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return Symmetric(_create_sparsity_pattern(matrix_type, dh, nothing, true, true, coupling, topology, cross_coupling), :U) end """ - create_symmetric_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling) + create_symmetric_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology}, cross_coupling, matrix_type) Create a symmetric sparsity pattern accounting for affine constraints in `ch`. See the Affine Constraints section of the manual for further details. 
""" function create_symmetric_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; keep_constrained::Bool=true, coupling=nothing, topology::Union{Nothing, AbstractTopology} = nothing, - cross_coupling = nothing) - return Symmetric(_create_sparsity_pattern(dh, ch, true, keep_constrained, coupling, topology, cross_coupling), :U) + cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return Symmetric(_create_sparsity_pattern(matrix_type, dh, ch, true, keep_constrained, coupling, topology, cross_coupling), :U) end """ - create_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology} = nothing) + create_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; coupling, topology::Union{Nothing, AbstractTopology}, matrix_type) Create a sparsity pattern accounting for affine constraints in `ch`. See the Affine Constraints section of the manual for further details. """ function create_sparsity_pattern(dh::AbstractDofHandler, ch::ConstraintHandler; keep_constrained::Bool=true, coupling=nothing, topology::Union{Nothing, AbstractTopology} = nothing, - cross_coupling = nothing) - return _create_sparsity_pattern(dh, ch, false, keep_constrained, coupling, topology, cross_coupling) + cross_coupling = nothing, matrix_type = SparseMatrixCSC{Float64}) + return _create_sparsity_pattern(matrix_type, dh, ch, false, keep_constrained, coupling, topology, cross_coupling) end # Compute a coupling matrix of size (ndofs_per_cell × ndofs_per_cell) based on the input @@ -163,8 +165,8 @@ function cross_element_coupling!(dh::DofHandler, ch::Union{ConstraintHandler, No return cnt end -function _create_sparsity_pattern(dh::AbstractDofHandler, ch#=::Union{ConstraintHandler, Nothing}=#, sym::Bool, keep_constrained::Bool, coupling::Union{AbstractMatrix{Bool},Nothing}, - topology::Union{Nothing, AbstractTopology}, cross_coupling::Union{AbstractMatrix{Bool},Nothing}) +function _create_sparsity_pattern(matrix_type::Type{SparseMatrixCSC{T}}, dh::AbstractDofHandler, ch#=::Union{ConstraintHandler, Nothing}=#, sym::Bool, keep_constrained::Bool, coupling::Union{AbstractMatrix{Bool},Nothing}, + topology::Union{Nothing, AbstractTopology}, cross_coupling::Union{AbstractMatrix{Bool},Nothing}) where {T} @assert isclosed(dh) if !keep_constrained @assert ch !== nothing && isclosed(ch) @@ -227,7 +229,7 @@ function _create_sparsity_pattern(dh::AbstractDofHandler, ch#=::Union{Constraint end @assert length(I) == length(J) == cnt - K = spzeros!!(Float64, I, J, ndofs(dh), ndofs(dh)) + K = spzeros!!(T, I, J, ndofs(dh), ndofs(dh)) # If ConstraintHandler is given, create the condensation pattern due to affine constraints if ch !== nothing @@ -295,7 +297,7 @@ function _condense_sparsity_pattern!(K::SparseMatrixCSC{T}, dofcoefficients::Vec resize!(J, cnt) # Fill the sparse matrix with a non-zero value so that :+ operation does not remove entries with value zero. - K2 = spzeros!!(Float64, I, J, ndofs, ndofs) + K2 = spzeros!!(T, I, J, ndofs, ndofs) fill!(K2.nzval, 1) K .+= K2 From d8456a4ee41c832a6b4e53f96d901b22bd00ba1c Mon Sep 17 00:00:00 2001 From: termi-official Date: Wed, 11 Oct 2023 19:11:17 +0200 Subject: [PATCH 03/16] Add missing tests. 
---
 test/test_dofs.jl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/test_dofs.jl b/test/test_dofs.jl
index 676fa1c6d6..dda5427718 100644
--- a/test/test_dofs.jl
+++ b/test/test_dofs.jl
@@ -436,6 +436,20 @@ end
         @test is_stored(K, i, j)
     end
 
+    # Full coupling (default) with reduced precision
+    K = create_sparsity_pattern(dh; matrix_type=SparseMatrixCSC{Float32})
+    @test eltype(K) == Float32
+    for j in 1:ndofs(dh), i in 1:ndofs(dh)
+        @test is_stored(K, i, j)
+    end
+
+    # Complex operator
+    K = create_sparsity_pattern(dh; matrix_type=SparseMatrixCSC{ComplexF64})
+    @test eltype(K) == ComplexF64
+    for j in 1:ndofs(dh), i in 1:ndofs(dh)
+        @test is_stored(K, i, j)
+    end
+
     # Field coupling
     coupling = [
         # u p

From 7e151bfd9217493b010a48821bd4609e2382b415 Mon Sep 17 00:00:00 2001
From: termi-official
Date: Tue, 17 Oct 2023 15:46:00 +0200
Subject: [PATCH 04/16] =?UTF-8?q?Make=20GPU=20catching=20=F0=9F=94=A5.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/src/literate-tutorials/gpu_assembly.jl | 204 ++++++++++++++++----
 1 file changed, 166 insertions(+), 38 deletions(-)

diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl
index c2adf4c57d..9a6163e8f8 100644
--- a/docs/src/literate-tutorials/gpu_assembly.jl
+++ b/docs/src/literate-tutorials/gpu_assembly.jl
@@ -58,26 +58,56 @@ end
 function Adapt.adapt_structure(to, nodes::Vector{Node})
     CuArray(get_node_coordinate.(nodes))
 end
-### TODO Adapt dofhandler
+### TODO Adapt dofhandler?
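+#
+# One conceivable answer to the TODO above, roughly the shape the `GPUDofHandler` wrapper
+# introduced later in this series takes, is to mirror the dof handler's flat arrays in a
+# small struct and let Adapt recurse into its fields. A sketch only, not yet used by the
+# code below:
+#
+# ```julia
+# struct GPUDofHandler{CD<:AbstractVector{Int},CO<:AbstractVector{Int},G}
+#     cell_dofs::CD
+#     cell_dofs_offset::CO
+#     grid::G
+# end
+# function Adapt.adapt_structure(to, dh::GPUDofHandler)
+#     GPUDofHandler(Adapt.adapt(to, dh.cell_dofs),
+#                   Adapt.adapt(to, dh.cell_dofs_offset),
+#                   Adapt.adapt(to, dh.grid))
+# end
+# ```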
# TODO not sure how to do this automatically function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) ξ_x = ξ[1] ξ_y = ξ[2] - i == 1 && return (1 - ξ_x) * (1 - ξ_y) * 0.25 - i == 2 && return (1 + ξ_x) * (1 - ξ_y) * 0.25 - i == 3 && return (1 + ξ_x) * (1 + ξ_y) * 0.25 - i == 4 && return (1 - ξ_x) * (1 + ξ_y) * 0.25 + i == 1 && return (1 - ξ_x) * (1 - ξ_y) / 4 + i == 2 && return (1 + ξ_x) * (1 - ξ_y) / 4 + i == 3 && return (1 + ξ_x) * (1 + ξ_y) / 4 + i == 4 && return (1 - ξ_x) * (1 + ξ_y) / 4 +end + +function unsafe_shape_value(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}, i::Int) + ξ_x = ξ[1] + ξ_y = ξ[2] + ξ_z = ξ[3] + i == 1 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) + i == 2 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) + i == 3 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) + i == 4 && return 0.125(1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) + i == 5 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) + i == 6 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) + i == 7 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) + i == 8 && return 0.125(1 - ξ_x) * (1 + ξ_y) * (1 + ξ_z) end function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] return @SVector [ - (1 - ξ_x) * (1 - ξ_y) * 0.25, - (1 + ξ_x) * (1 - ξ_y) * 0.25, - (1 + ξ_x) * (1 + ξ_y) * 0.25, - (1 - ξ_x) * (1 + ξ_y) * 0.25, + (1 - ξ_x) * (1 - ξ_y) / 4, + (1 + ξ_x) * (1 - ξ_y) / 4, + (1 + ξ_x) * (1 + ξ_y) / 4, + (1 - ξ_x) * (1 + ξ_y) / 4, + ] +end + +function shape_values(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}) + ξ_x = ξ[1] + ξ_y = ξ[2] + ξ_z = ξ[3] + return @SVector [ + (1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, + (1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, + (1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, + (1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, + (1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, + (1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, + (1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, + (1 - ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, ] end @@ -85,10 +115,26 @@ function shape_gradients(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] return @SMatrix [ - (0 - 1) * (1 - ξ_y) * 0.25 (1 - ξ_x) * (0 - 1) * 0.25; - (0 + 1) * (1 - ξ_y) * 0.25 (1 + ξ_x) * (0 - 1) * 0.25; - (0 + 1) * (1 + ξ_y) * 0.25 (1 + ξ_x) * (0 + 1) * 0.25; - (0 - 1) * (1 + ξ_y) * 0.25 (1 - ξ_x) * (0 + 1) * 0.25; + (0 - 1) * (1 - ξ_y) / 4 (1 - ξ_x) * (0 - 1) / 4; + (0 + 1) * (1 - ξ_y) / 4 (1 + ξ_x) * (0 - 1) / 4; + (0 + 1) * (1 + ξ_y) / 4 (1 + ξ_x) * (0 + 1) / 4; + (0 - 1) * (1 + ξ_y) / 4 (1 - ξ_x) * (0 + 1) / 4; + ] +end + +function shape_gradients(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}) + ξ_x = ξ[1] + ξ_y = ξ[2] + ξ_z = ξ[3] + return @SMatrix [ + (0 - 1) * (1 - ξ_y) * (1 - ξ_z) / 8 (1 - ξ_x) * (0 - 1) * (1 - ξ_z) / 8 (1 - ξ_x) * (1 - ξ_y) * (0 - 1) / 8; + (0 + 1) * (1 - ξ_y) * (1 - ξ_z) / 8 (1 + ξ_x) * (0 - 1) * (1 - ξ_z) / 8 (1 + ξ_x) * (1 - ξ_y) * (0 - 1) / 8; + (0 + 1) * (1 + ξ_y) * (1 - ξ_z) / 8 (1 + ξ_x) * (0 + 1) * (1 - ξ_z) / 8 (1 + ξ_x) * (1 + ξ_y) * (0 - 1) / 8; + (0 - 1) * (1 + ξ_y) * (1 - ξ_z) / 8 (1 - ξ_x) * (0 + 1) * (1 - ξ_z) / 8 (1 - ξ_x) * (1 + ξ_y) * (0 - 1) / 8; + (0 - 1) * (1 - ξ_y) * (1 + ξ_z) / 8 (1 - ξ_x) * (0 - 1) * (1 + ξ_z) / 8 (1 - ξ_x) * (1 - ξ_y) * (0 + 1) / 8; + (0 + 1) * (1 - ξ_y) * (1 + ξ_z) / 8 (1 + ξ_x) * (0 - 1) * (1 + ξ_z) / 8 (1 + ξ_x) * (1 - ξ_y) * (0 + 1) / 8; + (0 + 1) * (1 + ξ_y) * (1 + ξ_z) / 8 (1 + ξ_x) * (0 + 1) * (1 + ξ_z) / 8 (1 + ξ_x) * (1 + ξ_y) * (0 + 1) / 8; + (0 - 1) * (1 + ξ_y) * (1 + ξ_z) / 8 (1 - ξ_x) * (0 + 1) * (1 + ξ_z) / 8 (1 - ξ_x) * (1 + ξ_y) * (0 + 1) / 8; ] end @@ -96,13 +142,15 @@ 
function unsafe_shape_gradient(ip::Interpolation, ξ::Vec, i::Int) return Tensors.gradient(x -> unsafe_shape_value(ip, x, i), ξ) end +# ntuple fails... cellnodes(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]]) +cellnodes(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]], nodes[cell.nodes[5]], nodes[cell.nodes[6]], nodes[cell.nodes[7]], nodes[cell.nodes[8]]) # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Quadrilateral, (200, 200)); -colors = CuArray.(create_coloring(grid)); +grid = generate_grid(Hexahedron, (100, 100, 100)); +colors = CuArray.(create_coloring(grid)); # TODO add example without coloring, i.e. using Atomics instead # ### Trial and test functions # A `CellValues` facilitates the process of evaluating values and gradients of @@ -112,8 +160,8 @@ colors = CuArray.(create_coloring(grid)); # based on the two-dimensional reference quadrilateral. We also define a quadrature rule based on # the same reference element. We combine the interpolation and the quadrature rule # to a `CellValues` object. -ip = Lagrange{RefQuadrilateral, 1}() -qr = QuadratureRule{RefQuadrilateral}(2) +ip = Lagrange{RefHexahedron, 1}() +qr = QuadratureRule{RefHexahedron}(2) cellvalues = CellValues(qr, ip); # ### Degrees of freedom @@ -276,36 +324,116 @@ Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) -function generate_rhs(dh) - rhs = zeros(ndofs(dh)) - for cell in CellIterator(dh) - reinit!(cellvalues, cell) - coords = get_cell_coordinates(cell) - n_basefuncs = getnbasefunctions(cellvalues) - fe = zeros(n_basefuncs) - for q_point in 1:getnquadpoints(cellvalues) - ## Get the quadrature weight - dΩ = getdetJdV(cellvalues, q_point) - x = spatial_coordinate(cellvalues, q_point, coords) - ## Loop over test shape functions - for i in 1:n_basefuncs - δu = shape_value(cellvalues, q_point, i) - ## Add contribution to fe - fe[i] += cos(x[1]/π)*cos(x[2]/π)*δu * dΩ - end +struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} + # "GPUGrid" + gpu_cells::CACELLS + gpu_nodes::CANODES + colors::CACOLORS + # "GPUDofHandler" + all_cell_dofs::CADOFS + cell_dof_offsets::CAOFFSETS + # "GPUValues" + ip::IP + ip_geo::GIP + qr::QR +end + +function FerriteGPURHS(dh, colors, ip, ip_geo, qr) + FerriteGPURHS( + CuArray(getcells(Ferrite.get_grid(dh))), + CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + colors, + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + ip, ip_geo, qr + ) +end + +function generate_rhs(gpurhs::FerriteGPURHS) + rhs = CuArray(zeros(ndofs(dh))) + numthreads = 256 + numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) + dofs_per_cell = ndofs_per_cell(dh, 1) + for color ∈ gpurhs.colors + # try + @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.all_cell_dofs, gpurhs.cell_dof_offsets, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color, gpurhs.gpu_cells, gpurhs.gpu_nodes, dofs_per_cell) + synchronize() + # catch err + # code_typed(err; interactive = true) + # end + end + + # for cell in CellIterator(dh) + # reinit!(cellvalues, cell) + # coords = getcoordinates(cell) + # n_basefuncs = getnbasefunctions(cellvalues) + # fe = zeros(n_basefuncs) + # for q_point in 1:getnquadpoints(cellvalues) + # ## Get the quadrature weight + # dΩ = getdetJdV(cellvalues, q_point) + # x = spatial_coordinate(cellvalues, q_point, coords) + # ## Loop over test shape functions + # for i in 1:n_basefuncs + # δu = shape_value(cellvalues, q_point, i) + # ## Add contribution to fe + # fe[i] += cos(x[1]/π)*cos(x[2]/π)*δu * dΩ + # end + # end + # rhs[celldofs(cell)] .+= fe + # end + return rhs +end + +Base.@propagate_inbounds function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} + index = threadIdx().x # this example only requires linear indexing, so just use `x` + stride = blockDim().x + for i = index:stride:length(cell_indices) + ## Get index of the current cell + cell_index = cell_indices[i] + ## Grab the actual cell + cell = cells[cell_index] + ## Grab the dofs on the cell + cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) + cell_dofs = @view all_cell_dofs[cell_dof_range] + ## Grab the buffers for the y and x + rhsₑ = @view rhs[cell_dofs] + ## Grab coordinate array + coords = cellnodes(cell, nodes) + ## Apply local action for y = Ax + gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) + end +end + +function gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) + n_basefuncs = getnbasefunctions(ip) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, coords, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + + # TODO spatial_coordinate + x = zero(Vec{3,Float64}) + for i in 1:n_basefuncs + x += unsafe_shape_value(ip_geo, ξ, i)*coords[i] #geometric_value(fe_v, q_point, i) * x[i] + end + + for i in 1:n_basefuncs + ϕᵢ = unsafe_shape_value(ip, ξ, i) + rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᵢ * dΩ end - rhs[celldofs(cell)] .+= fe end - return CuArray(rhs) end -b = generate_rhs(dh) + + +b = generate_rhs(FerriteGPURHS(dh, colors, ip, ip, qr)) u = CUDA.fill(0.0, ndofs(dh)); cg!(u, A, b; verbose=true); # ### Exporting to VTK # To visualize the result we export the grid and our field `u` # to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). -vtk_grid("heat_equation", dh) do vtk +vtk_grid("heat_equation", dh) do vtk # The cake is a lie vtk_point_data(vtk, dh, Array(u)) end From 6f33d098e2ae72e791dbbddbdf6d0fad0bb44882 Mon Sep 17 00:00:00 2001 From: termi-official Date: Tue, 17 Oct 2023 15:46:23 +0200 Subject: [PATCH 05/16] Packages. 
--- docs/Manifest.toml | 185 ++++++++++++++++++++++++++++++++++++++++++++- docs/Project.toml | 3 + 2 files changed, 184 insertions(+), 4 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index e79fd3ef2e..9749a9bbd8 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,8 +1,8 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.9.1" +julia_version = "1.9.3" manifest_format = "2.0" -project_hash = "a0caf5d07899f1863658c7736f8530d20d28db07" +project_hash = "3221800eb396241cbc64f6deb94d7c003e4f881d" [[deps.ADTypes]] git-tree-sha1 = "5d2e21d7b0d8c22f67483ef95ebdc39c0e6b6003" @@ -14,6 +14,17 @@ git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" version = "0.0.1" +[[deps.AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "1.5.0" +weakdeps = ["ChainRulesCore", "Test"] + + [deps.AbstractFFTs.extensions] + AbstractFFTsChainRulesCoreExt = "ChainRulesCore" + AbstractFFTsTestExt = "Test" + [[deps.AbstractTrees]] git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -80,6 +91,18 @@ weakdeps = ["SparseArrays"] [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +[[deps.Atomix]] +deps = ["UnsafeAtomics"] +git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +version = "0.1.0" + +[[deps.BFloat16s]] +deps = ["LinearAlgebra", "Printf", "Random", "Test"] +git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66" +uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" +version = "0.4.2" + [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" @@ -123,12 +146,45 @@ git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" version = "1.0.8+0" +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.2" + [[deps.CPUSummary]] deps = ["CpuId", "IfElse", "PrecompileTools", "Static"] git-tree-sha1 = "601f7e7b3d36f18790e2caf83a882d88e9b71ff1" uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" version = "0.2.4" +[[deps.CUDA]] +deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "f062a48c26ae027f70c44f48f244862aec47bf99" +uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" +version = "5.0.0" +weakdeps = ["SpecialFunctions"] + + [deps.CUDA.extensions] + SpecialFunctionsExt = "SpecialFunctions" + +[[deps.CUDA_Driver_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] +git-tree-sha1 = "2f64185414751a5f878c4ab616c0edd94ade3419" +uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" +version = "0.6.0+4" + +[[deps.CUDA_Runtime_Discovery]] +deps = ["Libdl"] +git-tree-sha1 = "bcc4a23cbbd99c8535a5318455dcf0f2546ec536" +uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" +version = "0.2.2" + +[[deps.CUDA_Runtime_jll]] +deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "178a606891f82b6d734f0eadd5336b7aad44d5df" +uuid = 
"76a88914-d11a-5bdc-97e0-2f5a05c973a2" +version = "0.9.2+1" + [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] git-tree-sha1 = "4b859a208b2397a7a623a03449e4636bdb17bcf2" @@ -205,7 +261,7 @@ weakdeps = ["Dates", "LinearAlgebra"] [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.2+0" +version = "1.0.5+0" [[deps.ConcreteStructs]] git-tree-sha1 = "f749037478283d372048690eb3b5f92a79432b34" @@ -243,11 +299,22 @@ git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406" uuid = "adafc99b-e345-5852-983c-f28acb93d879" version = "0.3.1" +[[deps.Crayons]] +git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.1.1" + [[deps.DataAPI]] git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" version = "1.15.0" +[[deps.DataFrames]] +deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "1.6.1" + [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d" @@ -558,12 +625,24 @@ deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" version = "6.2.1+2" +[[deps.GPUArrays]] +deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] +git-tree-sha1 = "8ad8f375ae365aa1eb2f42e2565a40b55a4b69a8" +uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +version = "9.0.0" + [[deps.GPUArraysCore]] deps = ["Adapt"] git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.5" +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +git-tree-sha1 = "5e4487558477f191c043166f8301dd0b4be4e2b2" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.24.5" + [[deps.GR]] deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Pkg", "Preferences", "Printf", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "UUIDs", "p7zip_jll"] git-tree-sha1 = "27442171f28c952804dede8ff72828a96f2bfc1f" @@ -657,6 +736,12 @@ git-tree-sha1 = "5cd07aab533df5170988219191dfad0519391428" uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" version = "0.1.3" +[[deps.InlineStrings]] +deps = ["Parsers"] +git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" +uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +version = "1.4.0" + [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "ad37c091f7d7daf900963171600d7c1c5c3ede32" @@ -667,6 +752,11 @@ version = "2023.2.0+0" deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[deps.InvertedIndices]] +git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" 
+version = "1.3.0" + [[deps.IrrationalConstants]] git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" @@ -719,12 +809,28 @@ git-tree-sha1 = "6f2675ef130a300a112286de91973805fcc5ffbc" uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" version = "2.1.91+0" +[[deps.JuliaNVTXCallbacks_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" +uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" +version = "0.2.1+0" + [[deps.KLU]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse_jll"] git-tree-sha1 = "884c2968c2e8e7e6bf5956af88cb46aa745c854b" uuid = "ef3ab10e-7fda-4108-b977-705223b18434" version = "0.4.1" +[[deps.KernelAbstractions]] +deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] +git-tree-sha1 = "5f1ecfddb6abde48563d08b2cc7e5116ebcd6c27" +uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +version = "0.9.10" +weakdeps = ["EnzymeCore"] + + [deps.KernelAbstractions.extensions] + EnzymeExt = "EnzymeCore" + [[deps.Krylov]] deps = ["LinearAlgebra", "Printf", "SparseArrays"] git-tree-sha1 = "17e462054b42dcdda73e9a9ba0c67754170c88ae" @@ -743,6 +849,18 @@ git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434" uuid = "88015f11-f218-50d7-93a8-a6af411a945d" version = "3.0.0+1" +[[deps.LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "4ea2928a96acfcf8589e6cd1429eff2a3a82c366" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "6.3.0" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "e7c01b69bcbcb93fd4cbc3d0fea7d229541e18d2" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.26+0" + [[deps.LLVMOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "f689897ccbe049adb19a065c495e75f372ecd42b" @@ -1053,6 +1171,18 @@ git-tree-sha1 = "019f12e9a1a7880459d0173c182e6a99365d7ac1" uuid = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" version = "4.5.1" +[[deps.NVTX]] +deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] +git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" +uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" +version = "0.3.3" + +[[deps.NVTX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" +uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" +version = "3.1.0+2" + [[deps.NaNMath]] deps = ["OpenLibm_jll"] git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4" @@ -1181,7 +1311,7 @@ version = "0.42.2+0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.0" +version = "1.9.2" [[deps.PlotThemes]] deps = ["PlotUtils", "Statistics"] @@ -1227,6 +1357,12 @@ git-tree-sha1 = "240d7170f5ffdb285f9427b92333c3463bf65bf6" uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad" version = "0.2.1" +[[deps.PooledArrays]] +deps = ["DataAPI", "Future"] +git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "1.4.3" + [[deps.PositiveFactorizations]] deps = ["LinearAlgebra"] git-tree-sha1 = "17275485f373e6673f7e7f97051f703ed5b15b20" @@ -1257,6 +1393,12 @@ git-tree-sha1 = 
"00805cd429dcb4870060ff49ef443486c262e38e" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.4.1" +[[deps.PrettyTables]] +deps = ["Crayons", "LaTeXStrings", "Markdown", "Printf", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "ee094908d720185ddbdc58dbe0c1cbe35453ec7a" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "2.2.7" + [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -1281,6 +1423,18 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[deps.Random123]] +deps = ["Random", "RandomNumbers"] +git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" +uuid = "74087812-796a-5b5d-8853-05524746bad3" +version = "1.6.1" + +[[deps.RandomNumbers]] +deps = ["Random", "Requires"] +git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" +uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" +version = "1.5.3" + [[deps.RecipesBase]] deps = ["PrecompileTools"] git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" @@ -1409,6 +1563,12 @@ git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.0" +[[deps.SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.4.0" + [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" @@ -1568,6 +1728,12 @@ git-tree-sha1 = "b765e46ba27ecf6b44faf70df40c57aa3a547dcb" uuid = "69024149-9ee7-55f6-a4c4-859efe599b68" version = "0.3.7" +[[deps.StringManipulation]] +deps = ["PrecompileTools"] +git-tree-sha1 = "a04cabe79c5f01f4d723cc6704070ada0b9d46d5" +uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" +version = "0.3.4" + [[deps.StructTypes]] deps = ["Dates", "UUIDs"] git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70" @@ -1705,6 +1871,17 @@ git-tree-sha1 = "e2d817cc500e960fdbafcf988ac8436ba3208bfd" uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728" version = "1.6.3" +[[deps.UnsafeAtomics]] +git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" +uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" +version = "0.2.1" + +[[deps.UnsafeAtomicsLLVM]] +deps = ["LLVM", "UnsafeAtomics"] +git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e" +uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" +version = "0.1.3" + [[deps.Unzip]] git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78" uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" diff --git a/docs/Project.toml b/docs/Project.toml index 2d83ab8a20..752415eee6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,7 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" Ferrite = "c061ca5d-56c9-439f-9c0e-210fe06d3992" @@ -16,6 +18,7 @@ Optim = "429524aa-4258-5aef-a3af-852621145aeb" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" From 1e5cb41bbb9cbe01f5e1fad6cc4479e9461f9f74 Mon Sep 17 00:00:00 2001 From: termi-official Date: Wed, 13 Dec 2023 14:13:21 +0100 Subject: [PATCH 06/16] 
... --- docs/src/literate-tutorials/gpu_assembly.jl | 64 +++++++++------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 9a6163e8f8..bbc751cc82 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -53,10 +53,10 @@ struct GPUQuadratureRule{N,T,dim} end function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} N = length(qr.weights) - GPUQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) + GPUQuadratureRule{N,Float32,dim}(SVector{N,Float32}(qr.weights), SVector{N,Vec{dim,Float32}}(qr.points)) end -function Adapt.adapt_structure(to, nodes::Vector{Node}) - CuArray(get_node_coordinate.(nodes)) +function Adapt.adapt_structure(to, nodes::Vector{Node{dim}}) where dim + CuArray(Vec{dim,Float32}.(get_node_coordinate.(grid.nodes))) end ### TODO Adapt dofhandler? @@ -149,7 +149,7 @@ cellnodes(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]] # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Hexahedron, (100, 100, 100)); +grid = generate_grid(Hexahedron, (20, 20, 20)); colors = CuArray.(create_coloring(grid)); # TODO add example without coloring, i.e. using Atomics instead # ### Trial and test functions @@ -202,27 +202,10 @@ close!(dh); # add!(ch, dbc); # close!(ch) -function gpu_element_mass_action!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip, ip_geo, xₑ) where {V,N,T,dim} - n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) - ξ = qr.points[q_point] - # TODO recover abstraction layer - J = getJ(ip_geo, xₑ, ξ) - dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) - for i in 1:n_basefuncs - ϕᵢ = unsafe_shape_value(ip, ξ, i) - for j in 1:n_basefuncs - ϕⱼ = unsafe_shape_value(ip, ξ, j) - uₑout[i] += ϕᵢ*ϕⱼ*uₑin[j]*dΩ - end - end - end - return nothing -end - function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + # shapes = MVector{8,Float32}(undef) + @inbounds for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) ξ = qr.points[q_point] shapes = shape_values(ip, ξ) # TODO recover abstraction layer @@ -240,10 +223,10 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{ end # Mass action on a single color -Base.@propagate_inbounds function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} +function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x - for i = index:stride:length(cell_indices) + @inbounds for i = index:stride:length(cell_indices) ## Get index of the current 
cell cell_index = cell_indices[i] ## Grab the actual cell @@ -265,7 +248,7 @@ end # Mass action of the full operator function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} # Initialize solution - fill!(uout, 0.0) + fill!(uout, 0.f0) synchronize() # Apply action one time @@ -274,8 +257,9 @@ function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR numthreads = 256 numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) # try - @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) synchronize() + break # catch err # code_typed(err; interactive = true) # end @@ -308,9 +292,10 @@ struct FerriteGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, end function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) + grid = Ferrite.get_grid(dh) FerriteGPUMassOperator( - CuArray(getcells(Ferrite.get_grid(dh))), - CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + CuArray(getcells(grid)), + CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), colors, CuArray(dh.cell_dofs), CuArray(dh.cell_dofs_offset), @@ -339,9 +324,10 @@ struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} end function FerriteGPURHS(dh, colors, ip, ip_geo, qr) + grid = Ferrite.get_grid(dh) FerriteGPURHS( - CuArray(getcells(Ferrite.get_grid(dh))), - CuArray(get_node_coordinate.(getnodes(Ferrite.get_grid(dh)))), + CuArray(getcells(grid)), + CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), colors, CuArray(dh.cell_dofs), CuArray(dh.cell_dofs_offset), @@ -350,7 +336,7 @@ function FerriteGPURHS(dh, colors, ip, ip_geo, qr) end function generate_rhs(gpurhs::FerriteGPURHS) - rhs = CuArray(zeros(ndofs(dh))) + rhs = CuArray(zeros(Float32,ndofs(dh))) numthreads = 256 numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) dofs_per_cell = ndofs_per_cell(dh, 1) @@ -384,7 +370,7 @@ function generate_rhs(gpurhs::FerriteGPURHS) return rhs end -Base.@propagate_inbounds function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} +function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x for i = index:stride:length(cell_indices) @@ -427,15 +413,17 @@ end b = generate_rhs(FerriteGPURHS(dh, colors, ip, ip, qr)) -u = CUDA.fill(0.0, ndofs(dh)); -cg!(u, A, b; verbose=true); +u = CUDA.fill(0.f0, ndofs(dh)); +# cg!(u, A, b; verbose=true); +# @benchmark CUDA.@sync mul!($b,$A,$u) +# CUDA.@profile mul!(b,A,u) # ### Exporting to VTK # To visualize the result we export the grid and our field `u` # to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). 
-vtk_grid("heat_equation", dh) do vtk # The cake is a lie - vtk_point_data(vtk, dh, Array(u)) -end +# vtk_grid("heat_equation", dh) do vtk # The cake is a lie +# vtk_point_data(vtk, dh, Array(u)) +# end ## test the result #src # using Test #src From 07a0e7ae35634daa506dd87c9aba87f2d760d91b Mon Sep 17 00:00:00 2001 From: Dennis Ogiermann Date: Sat, 30 Dec 2023 15:51:38 +0100 Subject: [PATCH 07/16] GPU grid prototype. --- docs/src/literate-tutorials/gpu_assembly.jl | 58 ++++++++++++++++++--- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index bbc751cc82..f1a346834b 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -60,6 +60,22 @@ function Adapt.adapt_structure(to, nodes::Vector{Node{dim}}) where dim end ### TODO Adapt dofhandler? +struct CuGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:CUDA.Const{C,1}, NODA<:CUDA:Const{Vec{sdim,T},1}, COLA<:CUDA.Const{Int,1}} <: Ferrite.AbstractGrid{sdim} + cells::CELA + nodes::NODA + colors::COLA + # TODO subdomains +end + +CuGrid(grid::Grid{<:Any,<:Any,T}) where T = CuGrid(T, grid) +function CuGrid(::Type{T}, grid::Grid) + CuGrid( + Const(CuArray(getcells(grid))), + Const(CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid))))), + Const(CuArray(colors)) + ) +end + # TODO not sure how to do this automatically function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) ξ_x = ξ[1] @@ -223,7 +239,7 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{ end # Mass action on a single color -function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} +function gpu_full_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} index = threadIdx().x # this example only requires linear indexing, so just use `x` stride = blockDim().x @inbounds for i = index:stride:length(cell_indices) @@ -246,7 +262,7 @@ function gpu_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offset end # Mass action of the full operator -function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} +function gpu_full_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} # Initialize solution fill!(uout, 0.f0) synchronize() @@ -257,7 +273,7 @@ function gpu_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR numthreads = 256 numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) # try - @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) synchronize() break # catch err @@ -277,11 +293,9 @@ end # # Define the operator struct (https://iterativesolvers.julialinearalgebra.org/dev/getting_started/) -struct FerriteGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} +struct FerriteGPUMassOperator{TGRID, CADOFS, CAOFFSETS, IP, GIP, QR} # "GPUGrid" - gpu_cells::CACELLS - gpu_nodes::CANODES - colors::CACOLORS + grid::TGRID # "GPUDofHandler" all_cell_dofs::CADOFS cell_dof_offsets::CAOFFSETS @@ -303,12 +317,40 @@ function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) ) end -LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) +LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_full_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) Base.eltype(A::FerriteGPUMassOperator) = Float64 Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) +struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} + # "GPUGrid" + gpu_cells::CACELLS + gpu_nodes::CANODES + colors::CACOLORS + # "GPUDofHandler" + all_cell_dofs::CADOFS + cell_dof_offsets::CAOFFSETS + # "GPUValues" + ip::IP + ip_geo::GIP + qr::QR + # + element_matrices::EAMATS +end + +function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) + grid = Ferrite.get_grid(dh) + FerriteGPUMassOperator( + CuArray(getcells(grid)), + CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), + colors, + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + ip, ip_geo, qr + ) +end + struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} # "GPUGrid" gpu_cells::CACELLS From 0aca0274f9be54d232b498a73d578df9ac8abad2 Mon Sep 17 00:00:00 2001 From: termi-official Date: Sat, 30 Dec 2023 23:50:43 +0100 Subject: [PATCH 08/16] Initial GPUGrid and GPUDofHandler + performance bugfix. Can now generate NaNs at the speed of light. 
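
A sketch of how the two new wrappers are meant to be used together (constructors as
defined in this patch):

    gpugrid = GPUGrid(grid)    # cells and nodes copied into CuArrays
    gpudh   = GPUDofHandler(CuArray(dh.cell_dofs), CuArray(dh.cell_dofs_offset), gpugrid)

The intent is that kernels receive the whole handler and Adapt/`cudaconvert` turn the
contained `CuArray`s into device arrays at launch.
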
--- docs/src/literate-tutorials/gpu_assembly.jl | 303 +++++++++----------- src/Dofs/DofHandler.jl | 5 +- 2 files changed, 142 insertions(+), 166 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index f1a346834b..1a86c28c05 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -47,58 +47,64 @@ using IterativeSolvers, LinearAlgebra ### TODO Extension import Adapt using StaticArrays -struct GPUQuadratureRule{N,T,dim} +struct SmallQuadratureRule{N,T,dim} weights::SVector{N,T} points::SVector{N,Vec{dim,T}} end +SmallQuadratureRule(qr::QuadratureRule) = SmallQuadratureRule(SVector{length(qr.weights)}(qr.weights), SVector{length(qr.points)}(qr.points)) function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} N = length(qr.weights) - GPUQuadratureRule{N,Float32,dim}(SVector{N,Float32}(qr.weights), SVector{N,Vec{dim,Float32}}(qr.points)) + SmallQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) end -function Adapt.adapt_structure(to, nodes::Vector{Node{dim}}) where dim - CuArray(Vec{dim,Float32}.(get_node_coordinate.(grid.nodes))) +function Adapt.adapt_structure(to, nodes::Vector{Node{dim,T}}) where {dim,T} + CuArray(Vec{dim,T}.(get_node_coordinate.(grid.nodes))) end -### TODO Adapt dofhandler? -struct CuGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:CUDA.Const{C,1}, NODA<:CUDA:Const{Vec{sdim,T},1}, COLA<:CUDA.Const{Int,1}} <: Ferrite.AbstractGrid{sdim} +struct GPUGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:AbstractArray{C,1}, NODA<:AbstractArray{Node{sdim,T},1}} <: Ferrite.AbstractGrid{sdim} cells::CELA nodes::NODA - colors::COLA # TODO subdomains end +function Adapt.adapt_structure(to, grid::GPUGrid) + cells = Adapt.adapt_structure(to, grid.cells) + nodes = Adapt.adapt_structure(to, grid.nodes) + return GPUGrid(cells, nodes) +end -CuGrid(grid::Grid{<:Any,<:Any,T}) where T = CuGrid(T, grid) -function CuGrid(::Type{T}, grid::Grid) - CuGrid( - Const(CuArray(getcells(grid))), - Const(CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid))))), - Const(CuArray(colors)) +GPUGrid(grid::Grid{<:Any,<:Any,T}) where T = GPUGrid(T, grid) +function GPUGrid(::Type{T}, grid::Grid) where T + GPUGrid( + CuArray(getcells(grid)), + # CuArray(Vec{Ferrite.getdim(grid),T}.(get_node_coordinate.(getnodes(grid)))) + CuArray(getnodes(grid)) ) end -# TODO not sure how to do this automatically -function unsafe_shape_value(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}, i::Int) - ξ_x = ξ[1] - ξ_y = ξ[2] - i == 1 && return (1 - ξ_x) * (1 - ξ_y) / 4 - i == 2 && return (1 + ξ_x) * (1 - ξ_y) / 4 - i == 3 && return (1 + ξ_x) * (1 + ξ_y) / 4 - i == 4 && return (1 - ξ_x) * (1 + ξ_y) / 4 +struct GPUDofHandler{GRID <: GPUGrid, CADOFS <: AbstractArray{Int,1}, CAOFFSETS <: AbstractArray{Int,1}} + cell_dofs::CADOFS + cell_dofs_offset::CAOFFSETS + grid::GRID end - -function unsafe_shape_value(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}, i::Int) - ξ_x = ξ[1] - ξ_y = ξ[2] - ξ_z = ξ[3] - i == 1 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) - i == 2 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) - i == 3 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) - i == 4 && return 0.125(1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) - i == 5 && return 0.125(1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) - i == 6 && return 0.125(1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) - i == 7 && return 0.125(1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) - i == 8 && return 0.125(1 - ξ_x) * (1 + ξ_y) * 
(1 + ξ_z) +function GPUDofHandler(dh::DofHandler) + GPUDofHandler( + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + GPUGrid(dh.grid) + ) end +function GPUDofHandler(::Type{T}, dh::DofHandler) where T + GPUDofHandler( + CuArray(dh.cell_dofs), + CuArray(dh.cell_dofs_offset), + GPUGrid(T, dh.grid) + ) +end +Ferrite.ndofs_per_cell(dh::GPUDofHandler, i::Int) = (cell_dofs_offset[i+1]-1)-cell_dofs_offset[i] +function Ferrite.celldofs(dh::GPUDofHandler, cell_index::Int) + cell_dof_range = dh.cell_dofs_offset[cell_index]:(dh.cell_dofs_offset[cell_index+1]-1) + return @view dh.cell_dofs[cell_dof_range] +end +Adapt.@adapt_structure GPUDofHandler function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] @@ -160,13 +166,15 @@ end # ntuple fails... cellnodes(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]]) +cellnodesvec(cell::Quadrilateral, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.nodes[2]].x, nodes[cell.nodes[3]].x, nodes[cell.nodes[4]].x) cellnodes(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]], nodes[cell.nodes[2]], nodes[cell.nodes[3]], nodes[cell.nodes[4]], nodes[cell.nodes[5]], nodes[cell.nodes[6]], nodes[cell.nodes[7]], nodes[cell.nodes[8]]) +cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.nodes[2]].x, nodes[cell.nodes[3]].x, nodes[cell.nodes[4]].x, nodes[cell.nodes[5]].x, nodes[cell.nodes[6]].x, nodes[cell.nodes[7]].x, nodes[cell.nodes[8]].x) # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. grid = generate_grid(Hexahedron, (20, 20, 20)); -colors = CuArray.(create_coloring(grid)); # TODO add example without coloring, i.e. using Atomics instead +colors = create_coloring(grid); # TODO add example without coloring, i.e. 
using Atomics instead # ### Trial and test functions # A `CellValues` facilitates the process of evaluating values and gradients of @@ -218,7 +226,7 @@ close!(dh); # add!(ch, dbc); # close!(ch) -function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} +function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::SmallQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, xₑ) where {V,N,T,dim,IP,GIP} n_basefuncs = length(uₑin) #getnbasefunctions(cellvalues) # shapes = MVector{8,Float32}(undef) @inbounds for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) @@ -239,41 +247,41 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::GPUQuadratureRule{ end # Mass action on a single color -function gpu_full_mass_action_kernel!(uout::V, uin::V, all_cell_dofs, cell_dof_offsets, cell_indices, qr::GPUQuadratureRule{N,T,dim}, ip::IP, ip_geo::GIP, cells::CuDeviceArray{CT}, nodes::CuDeviceArray{Vec{dim,T}}, dofs_per_cell) where {V,N,T,dim,CT,IP,GIP} - index = threadIdx().x # this example only requires linear indexing, so just use `x` - stride = blockDim().x - @inbounds for i = index:stride:length(cell_indices) +function gpu_full_mass_action_kernel!(uout::AbstractVector, uin::AbstractVector, gpudh::GPUDofHandler, cell_indices::AbstractVector, qr::SmallQuadratureRule, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation) + index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + stride = gridDim().x * blockDim().x + i = index + while i <= length(cell_indices) ## Get index of the current cell cell_index = cell_indices[i] ## Grab the actual cell - cell = cells[cell_index] + cell = gpudh.grid.cells[cell_index] ## Grab the dofs on the cell - cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) - cell_dofs = @view all_cell_dofs[cell_dof_range] + cell_dofs = celldofs(gpudh, cell_index) ## Grab the buffers for the y and x uₑin = @view uin[cell_dofs] uₑout = @view uout[cell_dofs] ## Grab coordinate array - xₑ = cellnodes(cell, nodes) + xₑ = cellnodesvec(cell, gpudh.grid.nodes) ## Apply local action for y = Ax gpu_element_mass_action2!(uₑout, uₑin, qr, ip, ip_geo, xₑ) + i += stride end return nothing end # Mass action of the full operator -function gpu_full_mass_action!(uout::V, u::V, all_cell_dofs, cell_dof_offsets, qr::QR, ip::IP, ip_geo::GIP, colors, gpu_cells, gpu_nodes) where {V, QR, IP, GIP} +function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUDofHandler, qr, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, colors::AbstractVector) # Initialize solution - fill!(uout, 0.f0) + fill!(uout, 0.0) synchronize() # Apply action one time - dofs_per_cell = ndofs_per_cell(dh, 1) for color ∈ colors numthreads = 256 numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) # try - @show @device_code_warntype @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, all_cell_dofs, cell_dof_offsets, color, qr, ip, ip_geo, gpu_cells, gpu_nodes, dofs_per_cell) + @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) synchronize() break # catch err @@ -293,98 +301,104 @@ end # # Define the operator struct (https://iterativesolvers.julialinearalgebra.org/dev/getting_started/) -struct FerriteGPUMassOperator{TGRID, CADOFS, CAOFFSETS, IP, GIP, QR} - # "GPUGrid" - grid::TGRID - # "GPUDofHandler" - all_cell_dofs::CADOFS - cell_dof_offsets::CAOFFSETS +struct FerriteMFMassOperator{DH, COLS, IP, GIP, QR} + dh::DH + colors::COLS # "GPUValues" ip::IP ip_geo::GIP qr::QR end -function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) - grid = Ferrite.get_grid(dh) - FerriteGPUMassOperator( - CuArray(getcells(grid)), - CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), - colors, - CuArray(dh.cell_dofs), - CuArray(dh.cell_dofs_offset), - ip, ip_geo, qr - ) -end - -LinearAlgebra.mul!(y, A::FerriteGPUMassOperator, x) = gpu_full_mass_action!(y, x, A.all_cell_dofs, A.cell_dof_offsets, A.qr, A.ip, A.ip_geo, A.colors, A.gpu_cells, A.gpu_nodes) -Base.eltype(A::FerriteGPUMassOperator) = Float64 -Base.size(A::FerriteGPUMassOperator, d) = ndofs(dh) # Square operator +LinearAlgebra.mul!(y, A::FerriteMFMassOperator{<:GPUDofHandler}, x) = gpu_full_mass_action!(y, x, A.dh, A.qr, A.ip, A.ip_geo, A.colors) +Base.eltype(A::FerriteMFMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T +Base.eltype(A::FerriteMFMassOperator{<:GPUGrid{<:Any,<:Any,T}}) where T = T +Base.size(A::FerriteMFMassOperator, d) = ndofs(dh) # Square operator + +# struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} +# # "GPUGrid" +# gpu_cells::CACELLS +# gpu_nodes::CANODES +# colors::CACOLORS +# # "GPUDofHandler" +# all_cell_dofs::CADOFS +# cell_dofs_offset::CAOFFSETS +# # "GPUValues" +# ip::IP +# ip_geo::GIP +# qr::QR +# # +# element_matrices::EAMATS +# end -A = FerriteGPUMassOperator(dh, colors, ip, ip, qr) +# function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) +# grid = Ferrite.get_grid(dh) +# FerriteGPUMassOperator( +# CuArray(getcells(grid)), +# CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), +# colors, +# CuArray(dh.cell_dofs), +# CuArray(dh.cell_dofs_offset), +# ip, ip_geo, qr +# ) +# end -struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} - # "GPUGrid" - gpu_cells::CACELLS - gpu_nodes::CANODES - colors::CACOLORS - # "GPUDofHandler" - all_cell_dofs::CADOFS - cell_dof_offsets::CAOFFSETS +struct FerriteRHS{DH, COLS, IP, GIP, QR} + dh::DH + colors::COLS # "GPUValues" ip::IP ip_geo::GIP qr::QR - # - element_matrices::EAMATS end -function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) - grid = Ferrite.get_grid(dh) - FerriteGPUMassOperator( - CuArray(getcells(grid)), - CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), - colors, - CuArray(dh.cell_dofs), - CuArray(dh.cell_dofs_offset), - ip, ip_geo, qr - ) -end +function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC) where {RHS, GPUDH, QR, FIP, GIP, GPUC} + index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + stride = gridDim().x * blockDim().x + i = index + while i <= 
length(cell_indices) + ## Get index of the current cell + cell_index = cell_indices[i] + ## Grab the actual cell + cell = gpudh.grid.cells[cell_index] + # ## Grab the dofs on the cell + cell_dofs = celldofs(gpudh, cell_index) + ## Grab the buffers for the y and x + rhsₑ = @view rhs[cell_dofs] + ## Grab coordinate array + coords = cellnodesvec(cell, gpudh.grid.nodes) + ## Apply local action for y = Ax + n_basefuncs = getnbasefunctions(ip) + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, coords, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + + # # TODO spatial_coordinate + x = zero(Vec{3,Float64}) + ϕᵍ = shape_values(ip_geo, ξ) + for i in 1:n_basefuncs + x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] + end -struct FerriteGPURHS{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR} - # "GPUGrid" - gpu_cells::CACELLS - gpu_nodes::CANODES - colors::CACOLORS - # "GPUDofHandler" - all_cell_dofs::CADOFS - cell_dof_offsets::CAOFFSETS - # "GPUValues" - ip::IP - ip_geo::GIP - qr::QR + ϕᶠ = shape_values(ip, ξ) + @inbounds for i in 1:n_basefuncs + rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ + end + end + i += stride + end end -function FerriteGPURHS(dh, colors, ip, ip_geo, qr) - grid = Ferrite.get_grid(dh) - FerriteGPURHS( - CuArray(getcells(grid)), - CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), - colors, - CuArray(dh.cell_dofs), - CuArray(dh.cell_dofs_offset), - ip, ip_geo, qr - ) -end -function generate_rhs(gpurhs::FerriteGPURHS) - rhs = CuArray(zeros(Float32,ndofs(dh))) +function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) + rhs = CuArray(zeros(Float64,ndofs(dh))) numthreads = 256 numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) - dofs_per_cell = ndofs_per_cell(dh, 1) for color ∈ gpurhs.colors # try - @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.all_cell_dofs, gpurhs.cell_dof_offsets, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color, gpurhs.gpu_cells, gpurhs.gpu_nodes, dofs_per_cell) + @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) synchronize() # catch err # code_typed(err; interactive = true) @@ -412,52 +426,13 @@ function generate_rhs(gpurhs::FerriteGPURHS) return rhs end -function gpu_rhs_kernel!(rhs::RHS, all_cell_dofs::CD, cell_dof_offsets::DO, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC, cells::CELLS, nodes::GPUN, dofs_per_cell::DOFSPC) where {RHS, CD, DO, QR, FIP, GIP, GPUC, CELLS, GPUN, DOFSPC} - index = threadIdx().x # this example only requires linear indexing, so just use `x` - stride = blockDim().x - for i = index:stride:length(cell_indices) - ## Get index of the current cell - cell_index = cell_indices[i] - ## Grab the actual cell - cell = cells[cell_index] - ## Grab the dofs on the cell - cell_dof_range = cell_dof_offsets[cell_index]:(cell_dof_offsets[cell_index]+dofs_per_cell-1) - cell_dofs = @view all_cell_dofs[cell_dof_range] - ## Grab the buffers for the y and x - rhsₑ = @view rhs[cell_dofs] - ## Grab coordinate array - coords = cellnodes(cell, nodes) - ## Apply local action for y = Ax - gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) - end -end - -function gpu_rhs_kernel2!(rhsₑ, qr, ip, ip_geo, coords) - n_basefuncs = getnbasefunctions(ip) - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) - ξ = qr.points[q_point] - # TODO recover abstraction layer - J = getJ(ip_geo, coords, ξ) - dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) - - # TODO spatial_coordinate - x = zero(Vec{3,Float64}) - for i in 1:n_basefuncs - x += unsafe_shape_value(ip_geo, ξ, i)*coords[i] #geometric_value(fe_v, q_point, i) * x[i] - end - - for i in 1:n_basefuncs - ϕᵢ = unsafe_shape_value(ip, ξ, i) - rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᵢ * dΩ - end - end -end - - -b = generate_rhs(FerriteGPURHS(dh, colors, ip, ip, qr)) -u = CUDA.fill(0.f0, ndofs(dh)); -# cg!(u, A, b; verbose=true); -# @benchmark CUDA.@sync mul!($b,$A,$u) +cudh = GPUDofHandler(dh) +cucols = CuArray.(colors) +Agpu = FerriteMFMassOperator(cudh, cucols, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(FerriteRHS(cudh, cucols, ip, ip, SmallQuadratureRule(qr))) +u = CUDA.fill(0.0, ndofs(dh)); +cg!(u, Agpu, bgpu; verbose=true); +# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu); # CUDA.@profile mul!(b,A,u) # ### Exporting to VTK diff --git a/src/Dofs/DofHandler.jl b/src/Dofs/DofHandler.jl index e6e88dff19..f43fbb039d 100644 --- a/src/Dofs/DofHandler.jl +++ b/src/Dofs/DofHandler.jl @@ -131,7 +131,7 @@ close!(dh) function DofHandler(grid::G) where {dim, G <: AbstractGrid{dim}} ncells = getncells(grid) sdhs = SubDofHandler{DofHandler{dim, G}}[] - DofHandler{dim, G}(sdhs, Symbol[], Int[], zeros(Int, ncells), zeros(Int, ncells), ScalarWrapper(false), grid, ScalarWrapper(-1)) + DofHandler{dim, G}(sdhs, Symbol[], Int[], zeros(Int, ncells+1), zeros(Int, ncells), ScalarWrapper(false), grid, ScalarWrapper(-1)) end function Base.show(io::IO, mime::MIME"text/plain", dh::DofHandler) @@ -264,7 +264,7 @@ function add!(sdh::SubDofHandler, name::Symbol, ip::Interpolation) # TODO: warn if interpolation type is not the same? 
end end - + # Check that interpolation is compatible with cells it it added to refshape_sdh = getrefshape(getcells(sdh.dh.grid, first(sdh.cellset))) if refshape_sdh !== getrefshape(ip) @@ -374,6 +374,7 @@ function __close!(dh::DofHandler{dim}) where {dim} facedicts, ) end + dh.cell_dofs_offset[end] = length(dh.cell_dofs) + 1 dh.ndofs[] = maximum(dh.cell_dofs; init=0) dh.closed[] = true From 955a233635dadf5499ce3566760663556cea035d Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 00:44:19 +0100 Subject: [PATCH 09/16] Derp. --- docs/src/literate-tutorials/gpu_assembly.jl | 74 ++++++++++----------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 1a86c28c05..242910c5bd 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -75,7 +75,6 @@ GPUGrid(grid::Grid{<:Any,<:Any,T}) where T = GPUGrid(T, grid) function GPUGrid(::Type{T}, grid::Grid) where T GPUGrid( CuArray(getcells(grid)), - # CuArray(Vec{Ferrite.getdim(grid),T}.(get_node_coordinate.(getnodes(grid)))) CuArray(getnodes(grid)) ) end @@ -109,11 +108,13 @@ Adapt.@adapt_structure GPUDofHandler function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] + Nx = ((1 - ξ_x), (1 + ξ_x)) + Ny = ((1 - ξ_y), (1 + ξ_y)) return @SVector [ - (1 - ξ_x) * (1 - ξ_y) / 4, - (1 + ξ_x) * (1 - ξ_y) / 4, - (1 + ξ_x) * (1 + ξ_y) / 4, - (1 - ξ_x) * (1 + ξ_y) / 4, + Nx[1] * Ny[1] / 4, + Nx[2] * Ny[1] / 4, + Nx[2] * Ny[2] / 4, + Nx[1] * Ny[2] / 4, ] end @@ -121,15 +122,18 @@ function shape_values(ip::Lagrange{RefHexahedron, 1}, ξ::Vec{3}) ξ_x = ξ[1] ξ_y = ξ[2] ξ_z = ξ[3] + Nx = ((1 - ξ_x), (1 + ξ_x)) + Ny = ((1 - ξ_y), (1 + ξ_y)) + Nz = ((1 - ξ_z), (1 + ξ_z)) return @SVector [ - (1 - ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, - (1 + ξ_x) * (1 - ξ_y) * (1 - ξ_z) / 8, - (1 + ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, - (1 - ξ_x) * (1 + ξ_y) * (1 - ξ_z) / 8, - (1 - ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, - (1 + ξ_x) * (1 - ξ_y) * (1 + ξ_z) / 8, - (1 + ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, - (1 - ξ_x) * (1 + ξ_y) * (1 + ξ_z) / 8, + Nx[1] * Ny[1] * Nz[1] / 8, + Nx[2] * Ny[1] * Nz[1] / 8, + Nx[2] * Ny[2] * Nz[1] / 8, + Nx[1] * Ny[2] * Nz[1] / 8, + Nx[1] * Ny[1] * Nz[2] / 8, + Nx[2] * Ny[1] * Nz[2] / 8, + Nx[2] * Ny[2] * Nz[2] / 8, + Nx[1] * Ny[2] * Nz[2] / 8, ] end @@ -238,8 +242,8 @@ function gpu_element_mass_action2!(uₑout::V, uₑin::V, qr::SmallQuadratureRul for i in 1:n_basefuncs ϕᵢ = shapes[i] for j in 1:n_basefuncs - inner = shapes[j]*uₑin[j] - uₑout[i] += inner*ϕᵢ*dΩ + ϕⱼ = shapes[j] + uₑout[i] += ϕᵢ*ϕⱼ*uₑin[j]*dΩ end end end @@ -250,10 +254,9 @@ end function gpu_full_mass_action_kernel!(uout::AbstractVector, uin::AbstractVector, gpudh::GPUDofHandler, cell_indices::AbstractVector, qr::SmallQuadratureRule, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation) index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x - i = index - while i <= length(cell_indices) + while index <= length(cell_indices) ## Get index of the current cell - cell_index = cell_indices[i] + cell_index = cell_indices[index] ## Grab the actual cell cell = gpudh.grid.cells[cell_index] ## Grab the dofs on the cell @@ -265,7 +268,7 @@ function gpu_full_mass_action_kernel!(uout::AbstractVector, uin::AbstractVector, xₑ = cellnodesvec(cell, gpudh.grid.nodes) ## Apply local action for y = Ax gpu_element_mass_action2!(uₑout, uₑin, qr, ip, ip_geo, xₑ) - i += stride + index 
+= stride end return nothing end @@ -278,12 +281,11 @@ function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUD # Apply action one time for color ∈ colors - numthreads = 256 - numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) + #numthreads = 1 + #numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) # try - @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) + @cuda gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) synchronize() - break # catch err # code_typed(err; interactive = true) # end @@ -355,10 +357,9 @@ end function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC) where {RHS, GPUDH, QR, FIP, GIP, GPUC} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x - i = index - while i <= length(cell_indices) + while index <= length(cell_indices) ## Get index of the current cell - cell_index = cell_indices[i] + cell_index = cell_indices[index] ## Grab the actual cell cell = gpudh.grid.cells[cell_index] # ## Grab the dofs on the cell @@ -368,7 +369,6 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c ## Grab coordinate array coords = cellnodesvec(cell, gpudh.grid.nodes) ## Apply local action for y = Ax - n_basefuncs = getnbasefunctions(ip) for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) ξ = qr.points[q_point] # TODO recover abstraction layer @@ -378,27 +378,27 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c # # TODO spatial_coordinate x = zero(Vec{3,Float64}) ϕᵍ = shape_values(ip_geo, ξ) - for i in 1:n_basefuncs + for i in 1:getnbasefunctions(ip_geo) x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] end ϕᶠ = shape_values(ip, ξ) - @inbounds for i in 1:n_basefuncs + @inbounds for i in 1:getnbasefunctions(ip) rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ end end - i += stride + index += stride end end function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) rhs = CuArray(zeros(Float64,ndofs(dh))) - numthreads = 256 - numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) + # numthreads = 1 + # numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) for color ∈ gpurhs.colors # try - @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) + @cuda gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) synchronize() # catch err # code_typed(err; interactive = true) @@ -427,12 +427,12 @@ function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) end cudh = GPUDofHandler(dh) -cucols = CuArray.(colors) -Agpu = FerriteMFMassOperator(cudh, cucols, ip, ip, SmallQuadratureRule(qr)) -bgpu = generate_rhs(FerriteRHS(cudh, cucols, ip, ip, SmallQuadratureRule(qr))) +cucolors = CuArray.(colors) +Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr))) u = CUDA.fill(0.0, ndofs(dh)); cg!(u, Agpu, bgpu; verbose=true); -# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu); +# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) # CUDA.@profile mul!(b,A,u) # ### Exporting to VTK From 75c6dd716fe747876bd85cb40a42dcf530263a4b Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 01:03:27 +0100 Subject: [PATCH 10/16] Back to Float32. 
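Going back to Float32 means every ingredient the kernels touch (grid corners, quadrature rule, right
hand side and solution vectors) carries the same scalar type, so nothing silently promotes to Float64
on the device. Small illustration of the promotion rule being avoided (not part of the diff):

    a = 1.0f0
    println(typeof(0.25  * a))   # Float64 -- a single Float64 literal promotes the product
    println(typeof(0.25f0 * a))  # Float32 -- stays in single precision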
--- docs/src/literate-tutorials/gpu_assembly.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 242910c5bd..bfde120974 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -177,7 +177,7 @@ cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.node # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Hexahedron, (20, 20, 20)); +grid = generate_grid(Hexahedron, (20, 20, 20), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); colors = create_coloring(grid); # TODO add example without coloring, i.e. using Atomics instead # ### Trial and test functions @@ -189,7 +189,7 @@ colors = create_coloring(grid); # TODO add example without coloring, i.e. using # the same reference element. We combine the interpolation and the quadrature rule # to a `CellValues` object. ip = Lagrange{RefHexahedron, 1}() -qr = QuadratureRule{RefHexahedron}(2) +qr = QuadratureRule{RefHexahedron,Float32}(2) cellvalues = CellValues(qr, ip); # ### Degrees of freedom @@ -354,7 +354,7 @@ struct FerriteRHS{DH, COLS, IP, GIP, QR} qr::QR end -function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, cell_indices::GPUC) where {RHS, GPUDH, QR, FIP, GIP, GPUC} +function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {T} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while index <= length(cell_indices) @@ -376,7 +376,7 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) # # TODO spatial_coordinate - x = zero(Vec{3,Float64}) + x = zero(Vec{3,T}) ϕᵍ = shape_values(ip_geo, ξ) for i in 1:getnbasefunctions(ip_geo) x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] @@ -392,8 +392,8 @@ function gpu_rhs_kernel!(rhs::RHS, gpudh::GPUDH, qr::QR, ip::FIP, ip_geo::GIP, c end -function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler}) - rhs = CuArray(zeros(Float64,ndofs(dh))) +function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}) where T + rhs = CuArray(zeros(T,ndofs(dh))) # numthreads = 1 # numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) for color ∈ gpurhs.colors @@ -429,8 +429,9 @@ end cudh = GPUDofHandler(dh) cucolors = CuArray.(colors) Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) -bgpu = generate_rhs(FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr))) -u = CUDA.fill(0.0, ndofs(dh)); +rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(rhsop) +u = CUDA.fill(0.f0, ndofs(dh)); cg!(u, Agpu, bgpu; verbose=true); # @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) # CUDA.@profile mul!(b,A,u) From 10d6a51085d606f1fc3326fd0ab36636fad21f1c Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 03:01:20 +0100 Subject: [PATCH 11/16] Fix Float32 2D grid generator. 
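The 2D node generator computed coordinates through Float64 ratios (`j / (nx - 1)` is Float64) and
pushed an untyped `Node((x, y))`, so the coordinates came out as Float64 even when the corners were
Float32. The fix dispatches on the node eltype and converts explicitly. Sketch of the promotion being
guarded against (illustrative, not part of the diff):

    ratio = 1 / 3                   # Int / Int -> Float64
    x0, x1 = -1.0f0, 1.0f0          # Float32 corners
    x = x0 * (1 - ratio) + ratio * x1
    println(typeof(x))              # Float64 -- hence the explicit T(x), T(y) in the patch
    println(typeof(Float32(x)))     # Float32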
--- src/Grid/grid_generators.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Grid/grid_generators.jl b/src/Grid/grid_generators.jl index 38bebc88f0..b2d119bf2e 100644 --- a/src/Grid/grid_generators.jl +++ b/src/Grid/grid_generators.jl @@ -83,7 +83,7 @@ function generate_grid(::Type{QuadraticLine}, nel::NTuple{1,Int}, left::Vec{1,T} return Grid(cells, nodes, facesets=facesets, boundary_matrix=boundary_matrix) end -function _generate_2d_nodes!(nodes, nx, ny, LL, LR, UR, UL) +function _generate_2d_nodes!(nodes::AbstractVector{<:Node{2,T}}, nx, ny, LL, LR, UR, UL) where T for i in 0:ny-1 ratio_bounds = i / (ny-1) @@ -97,7 +97,7 @@ function _generate_2d_nodes!(nodes, nx, ny, LL, LR, UR, UL) ratio = j / (nx-1) x = x0 * (1 - ratio) + ratio * x1 y = y0 * (1 - ratio) + ratio * y1 - push!(nodes, Node((x, y))) + push!(nodes, Node((T(x), T(y)))) end end end From 778d57b075764ef7396d90b4c6ef1d56b56d3ea4 Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 03:01:42 +0100 Subject: [PATCH 12/16] Fix performance and add element assembly. --- docs/src/literate-tutorials/gpu_assembly.jl | 171 +++++++++++++------- 1 file changed, 117 insertions(+), 54 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index bfde120974..252d1887fd 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -46,7 +46,7 @@ using IterativeSolvers, LinearAlgebra ### TODO Extension import Adapt -using StaticArrays +using StaticArrays, SparseArrays struct SmallQuadratureRule{N,T,dim} weights::SVector{N,T} points::SVector{N,Vec{dim,T}} @@ -177,32 +177,81 @@ cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.node # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -grid = generate_grid(Hexahedron, (20, 20, 20), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); +# grid = generate_grid(Hexahedron, (2, 1, 1), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); +grid = generate_grid(Quadrilateral, (2, 1), Vec((-1.f0,-1.f0)), Vec((1.f0,1.f0))); colors = create_coloring(grid); # TODO add example without coloring, i.e. using Atomics instead -# ### Trial and test functions -# A `CellValues` facilitates the process of evaluating values and gradients of -# test and trial functions (among other things). To define -# this we need to specify an interpolation space for the shape functions. -# We use Lagrange functions -# based on the two-dimensional reference quadrilateral. We also define a quadrature rule based on -# the same reference element. We combine the interpolation and the quadrature rule -# to a `CellValues` object. -ip = Lagrange{RefHexahedron, 1}() -qr = QuadratureRule{RefHexahedron,Float32}(2) +ip = Lagrange{RefQuadrilateral, 1}() +qr = QuadratureRule{RefQuadrilateral,Float32}(2) cellvalues = CellValues(qr, ip); -# ### Degrees of freedom -# Next we need to define a `DofHandler`, which will take care of numbering -# and distribution of degrees of freedom for our approximated fields. -# We create the `DofHandler` and then add a single scalar field called `:u` based on -# our interpolation `ip` defined above. -# Lastly we `close!` the `DofHandler`, it is now that the dofs are distributed -# for all the elements. 
dh = DofHandler(grid) add!(dh, :u, ip) close!(dh); +function assemble_element!(Ke::AbstractMatrix, cellvalues::CellValues) + n_basefuncs = getnbasefunctions(cellvalues) + fill!(Ke, 0) + for q_point in 1:getnquadpoints(cellvalues) + dΩ = getdetJdV(cellvalues, q_point) + for i in 1:n_basefuncs + δu = shape_value(cellvalues, q_point, i) + for j in 1:n_basefuncs + u = shape_value(cellvalues, q_point, j) + Ke[i, j] += (δu ⋅ u) * dΩ + end + end + end + return Ke +end + +function assemble_global(cellvalues::CellValues, K::SparseMatrixCSC, dh::DofHandler) + n_basefuncs = getnbasefunctions(cellvalues) + Ke = zeros(n_basefuncs, n_basefuncs) + assembler = start_assemble(K) + for cell in CellIterator(dh) + reinit!(cellvalues, cell) + assemble_element!(Ke, cellvalues) + assemble!(assembler, celldofs(cell), Ke) + end + return K +end + +function assemble_global(cellvalues::CellValues, K::Array{<:Any,3}, dh::DofHandler) + for cell in CellIterator(dh) + ## Reinitialize cellvalues for this cell + reinit!(cellvalues, cell) + ## Compute element contribution + Ke = @view K[:,:,cellid(cell)] + assemble_element!(Ke, cellvalues) + end + return K +end + +Kcpu = create_sparsity_pattern(dh) +assemble_global(cellvalues, Kcpu, dh) + +Keacpu = zeros(Float32, ndofs_per_cell(dh,1),ndofs_per_cell(dh,1),length(dh.grid.cells)) +assemble_global(cellvalues, Keacpu, dh) + +function d2eop(dh::DofHandler) + I = Int32[] + J = Int32[] + next_edof_idx = 1 + for ei ∈ 1:length(dh.grid.cells) + cell_dofs = celldofs(dh, ei) + for cell_dof ∈ cell_dofs + push!(I, Int32(next_edof_idx)) + push!(J, Int32(cell_dof)) + next_edof_idx += 1 + end + end + V = ones(Bool, next_edof_idx-1) + return sparse(I,J,V) +end + +d2e = d2eop(dh) + # ### Boundary conditions # In Ferrite constraints like Dirichlet boundary conditions # are handled by a `ConstraintHandler`. @@ -280,11 +329,11 @@ function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUD synchronize() # Apply action one time + numthreads = 128 for color ∈ colors - #numthreads = 1 - #numblocks = 1 #fails...? 
ceil(Int, length(color)/numthreads) + numblocks = ceil(Int, length(color)/numthreads) # try - @cuda gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) + @cuda threads=numthreads blocks=numblocks gpu_full_mass_action_kernel!(uout, u, dh, color, qr, ip, ip_geo) synchronize() # catch err # code_typed(err; interactive = true) @@ -317,33 +366,44 @@ Base.eltype(A::FerriteMFMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T Base.eltype(A::FerriteMFMassOperator{<:GPUGrid{<:Any,<:Any,T}}) where T = T Base.size(A::FerriteMFMassOperator, d) = ndofs(dh) # Square operator -# struct FerriteEAGPUMassOperator{CACELLS, CANODES, CACOLORS, CADOFS, CAOFFSETS, IP, GIP, QR, EAMATS <: AbstractArray{3}} -# # "GPUGrid" -# gpu_cells::CACELLS -# gpu_nodes::CANODES -# colors::CACOLORS -# # "GPUDofHandler" -# all_cell_dofs::CADOFS -# cell_dofs_offset::CAOFFSETS -# # "GPUValues" -# ip::IP -# ip_geo::GIP -# qr::QR -# # -# element_matrices::EAMATS -# end +struct FerriteEAMassOperator{D2E, XE2, EAMATS <: AbstractArray{<:Any,3}} + d2e::D2E + xe2::XE2 + element_matrices::EAMATS +end -# function FerriteGPUMassOperator(dh, colors, ip, ip_geo, qr) -# grid = Ferrite.get_grid(dh) -# FerriteGPUMassOperator( -# CuArray(getcells(grid)), -# CuArray(Vec{Ferrite.getdim(grid),Float32}.(get_node_coordinate.(getnodes(grid)))), -# colors, -# CuArray(dh.cell_dofs), -# CuArray(dh.cell_dofs_offset), -# ip, ip_geo, qr -# ) -# end +function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::CuDeviceArray{T,2}) where T + cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + stride = gridDim().x * blockDim().x + while cell_index ≤ size(emats,3) + xei = @view xe[:,cell_index] + xe2i = @view xe2[:,cell_index] + emati = @view emats[:,:,cell_index] + # mul!(xe2i, emati, xei) # fails? + xe2i .= T(0.0) + for i ∈ 1:size(emats,1), j ∈ 1:size(emats,2) + xe2i[i] += emati[i,j]*xei[j] + end + cell_index += stride + end +end + +function LinearAlgebra.mul!(y,A::FerriteEAMassOperator{<:Any,<:Any,<:CuArray},x) + xe = A.d2e*x + synchronize() + xev = reshape(xe, (size(A.element_matrices,2), size(A.element_matrices,3))) + xe2v = reshape(A.xe2, (size(A.element_matrices,2), size(A.element_matrices,3))) + synchronize() + # CUDA.CUBLAS.gemv_batched!(???) 
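+    # Gather / apply / scatter: d2e*x packs the global dofs into element-local blocks,
+    # the kernel launch below computes xe2v[:, c] = element_matrices[:, :, c] * xev[:, c] per cell c,
+    # and transpose(d2e) accumulates the element results back into the global vector.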
+ numthreads = 128 + numblocks = ceil(Int, size(A.element_matrices,3)/numthreads) + @cuda threads=numthreads blocks=numblocks gemv_batched!(xe2v, A.element_matrices, xev) + y .= transpose(A.d2e)*A.xe2 + synchronize() +end +Base.eltype(A::FerriteEAMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T +Base.eltype(A::FerriteEAMassOperator{<:GPUGrid{<:Any,<:Any,T}}) where T = T +Base.size(A::FerriteEAMassOperator, d) = ndofs(dh) # Square operator struct FerriteRHS{DH, COLS, IP, GIP, QR} dh::DH @@ -354,7 +414,7 @@ struct FerriteRHS{DH, COLS, IP, GIP, QR} qr::QR end -function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {T} +function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{sdim,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {sdim,T} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while index <= length(cell_indices) @@ -376,7 +436,7 @@ function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{ dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) # # TODO spatial_coordinate - x = zero(Vec{3,T}) + x = zero(Vec{sdim,T}) ϕᵍ = shape_values(ip_geo, ξ) for i in 1:getnbasefunctions(ip_geo) x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] @@ -394,11 +454,11 @@ end function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}) where T rhs = CuArray(zeros(T,ndofs(dh))) - # numthreads = 1 - # numblocks = 1 #fails...? ceil(Int, length(color)/numthreads) + numthreads = 128 for color ∈ gpurhs.colors + numblocks = ceil(Int, length(color)/numthreads) # try - @cuda gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) + @cuda threads=numthreads blocks=numblocks gpu_rhs_kernel!(rhs, gpurhs.dh, gpurhs.qr, gpurhs.ip, gpurhs.ip_geo, color) synchronize() # catch err # code_typed(err; interactive = true) @@ -432,10 +492,13 @@ Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) bgpu = generate_rhs(rhsop) u = CUDA.fill(0.f0, ndofs(dh)); -cg!(u, Agpu, bgpu; verbose=true); +# cg!(u, Agpu, bgpu; verbose=true); # @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) # CUDA.@profile mul!(b,A,u) +Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSC(d2e), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) +# @benchmark CUDA.@sync mul!($u,$Aeagpu,$bgpu) + # ### Exporting to VTK # To visualize the result we export the grid and our field `u` # to a VTK-file, which can be viewed in e.g. [ParaView](https://www.paraview.org/). From 3e9edc521c387fc46b71346a90a20711977fbec3 Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 03:50:49 +0100 Subject: [PATCH 13/16] Benchmark code. 
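The timing calls themselves stay commented out; the intent is to compare three ways of applying the
mass operator on the same 100x100x100 hexahedral problem: the matrix-free operator, the
element-assembly operator, and an assembled sparse matrix as baseline. Rough sketch of how to run them
(assumes BenchmarkTools and the operators/vectors from the script being in scope):

    using BenchmarkTools, CUDA, LinearAlgebra
    @benchmark CUDA.@sync mul!($ugpu, $Agpu,   $bgpu)   # matrix-free: coloring + per-cell kernels
    @benchmark CUDA.@sync mul!($ugpu, $Aeagpu, $bgpu)   # element assembly: batched per-cell mat-vec
    @benchmark CUDA.@sync mul!($ugpu, $Kgpu,   $bgpu)   # baseline: assembled CUSPARSE SpMV

CUDA.@sync matters here so the measurement includes kernel execution rather than just the
asynchronous launch.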
--- docs/src/literate-tutorials/gpu_assembly.jl | 50 ++++++++++----------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 252d1887fd..7a15fb520c 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -177,12 +177,12 @@ cellnodesvec(cell::Hexahedron, nodes) = (nodes[cell.nodes[1]].x, nodes[cell.node # We start by generating a simple grid with 20x20 quadrilateral elements # using `generate_grid`. The generator defaults to the unit square, # so we don't need to specify the corners of the domain. -# grid = generate_grid(Hexahedron, (2, 1, 1), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); -grid = generate_grid(Quadrilateral, (2, 1), Vec((-1.f0,-1.f0)), Vec((1.f0,1.f0))); +grid = generate_grid(Hexahedron, (100, 100, 100), Vec{3}((-1.f0,-1.f0,-1.f0)), Vec{3}((1.f0,1.f0,1.f0))); +# grid = generate_grid(Quadrilateral, (2, 1), Vec((-1.f0,-1.f0)), Vec((1.f0,1.f0))); colors = create_coloring(grid); # TODO add example without coloring, i.e. using Atomics instead -ip = Lagrange{RefQuadrilateral, 1}() -qr = QuadratureRule{RefQuadrilateral,Float32}(2) +ip = Lagrange{RefHexahedron, 1}() +qr = QuadratureRule{RefHexahedron,Float32}(2) cellvalues = CellValues(qr, ip); dh = DofHandler(grid) @@ -228,7 +228,7 @@ function assemble_global(cellvalues::CellValues, K::Array{<:Any,3}, dh::DofHandl return K end -Kcpu = create_sparsity_pattern(dh) +Kcpu = convert(SparseMatrixCSC{Float32, Int32}, create_sparsity_pattern(dh)) assemble_global(cellvalues, Kcpu, dh) Keacpu = zeros(Float32, ndofs_per_cell(dh,1),ndofs_per_cell(dh,1),length(dh.grid.cells)) @@ -246,35 +246,20 @@ function d2eop(dh::DofHandler) next_edof_idx += 1 end end - V = ones(Bool, next_edof_idx-1) + V = ones(Float32, next_edof_idx-1) return sparse(I,J,V) end d2e = d2eop(dh) # ### Boundary conditions -# In Ferrite constraints like Dirichlet boundary conditions -# are handled by a `ConstraintHandler`. # ch = ConstraintHandler(dh); - -# Next we need to add constraints to `ch`. For this problem we define -# homogeneous Dirichlet boundary conditions on the whole boundary, i.e. -# the `union` of all the face sets on the boundary. # ∂Ω = union( # getfaceset(grid, "left"), # getfaceset(grid, "right"), # getfaceset(grid, "top"), # getfaceset(grid, "bottom"), # ); - -# Now we are set up to define our constraint. We specify which field -# the condition is for, and our combined face set `∂Ω`. The last -# argument is a function of the form $f(\textbf{x})$ or $f(\textbf{x}, t)$, -# where $\textbf{x}$ is the spatial coordinate and -# $t$ the current time, and returns the prescribed value. Since the boundary condition in -# this case do not depend on time we define our function as $f(\textbf{x}) = 0$, i.e. -# no matter what $\textbf{x}$ we return $0$. When we have -# specified our constraint we `add!` it to `ch`. 
# dbc = Dirichlet(:u, ∂Ω, (x, t) -> 0) # add!(ch, dbc); # close!(ch) @@ -373,6 +358,10 @@ struct FerriteEAMassOperator{D2E, XE2, EAMATS <: AbstractArray{<:Any,3}} end function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::CuDeviceArray{T,2}) where T + # base_cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x + # stride = blockDim().x + # for cell_index ∈ base_cell_index:min(base_cell_index+stride,size(emats,3)) + cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while cell_index ≤ size(emats,3) @@ -381,7 +370,7 @@ function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::C emati = @view emats[:,:,cell_index] # mul!(xe2i, emati, xei) # fails? xe2i .= T(0.0) - for i ∈ 1:size(emats,1), j ∈ 1:size(emats,2) + for j ∈ 1:size(emats,2), i ∈ 1:size(emats,1) xe2i[i] += emati[i,j]*xei[j] end cell_index += stride @@ -398,7 +387,8 @@ function LinearAlgebra.mul!(y,A::FerriteEAMassOperator{<:Any,<:Any,<:CuArray},x) numthreads = 128 numblocks = ceil(Int, size(A.element_matrices,3)/numthreads) @cuda threads=numthreads blocks=numblocks gemv_batched!(xe2v, A.element_matrices, xev) - y .= transpose(A.d2e)*A.xe2 + synchronize() + mul!(y, transpose(A.d2e), A.xe2) synchronize() end Base.eltype(A::FerriteEAMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T @@ -491,13 +481,19 @@ cucolors = CuArray.(colors) Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) bgpu = generate_rhs(rhsop) -u = CUDA.fill(0.f0, ndofs(dh)); -# cg!(u, Agpu, bgpu; verbose=true); +ugpu = CUDA.fill(0.f0, ndofs(dh)); +# cg!(ugpu, Agpu, bgpu; verbose=true); # @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) -# CUDA.@profile mul!(b,A,u) Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSC(d2e), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) -# @benchmark CUDA.@sync mul!($u,$Aeagpu,$bgpu) +# @benchmark CUDA.@sync mul!($ugpu,$Aeagpu,$bgpu) + +ucpu = copy(Array(ugpu)) +bcpu = copy(Array(bgpu)) +# @benchmark mul!($u,$Kcpu,$(Array(bgpu))) + +Kgpu = CUDA.CUSPARSE.CuSparseMatrixCSC(Kcpu) +# @benchmark mul!($ugpu,$Kcpu,$bgpu) # ### Exporting to VTK # To visualize the result we export the grid and our field `u` From 7a42d4cd7272db71696bf4f9ff0d1e419b4d820c Mon Sep 17 00:00:00 2001 From: termi-official Date: Sun, 31 Dec 2023 23:50:49 +0100 Subject: [PATCH 14/16] Move small quadrature rule to Ferrite core --- docs/src/literate-tutorials/gpu_assembly.jl | 12 ----------- src/Quadrature/quadrature.jl | 22 +++++++++++++++++++++ src/exports.jl | 2 ++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 7a15fb520c..5277eb5e7a 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -47,18 +47,6 @@ using IterativeSolvers, LinearAlgebra ### TODO Extension import Adapt using StaticArrays, SparseArrays -struct SmallQuadratureRule{N,T,dim} - weights::SVector{N,T} - points::SVector{N,Vec{dim,T}} -end -SmallQuadratureRule(qr::QuadratureRule) = SmallQuadratureRule(SVector{length(qr.weights)}(qr.weights), SVector{length(qr.points)}(qr.points)) -function Adapt.adapt_structure(to, qr::QuadratureRule{shape,T,dim}) where {shape,T,dim} - N = length(qr.weights) - SmallQuadratureRule{N,T,dim}(SVector{N,T}(qr.weights), SVector{N,Vec{dim,T}}(qr.points)) -end -function Adapt.adapt_structure(to, 
nodes::Vector{Node{dim,T}}) where {dim,T} - CuArray(Vec{dim,T}.(get_node_coordinate.(grid.nodes))) -end struct GPUGrid{sdim,C<:Ferrite.AbstractCell,T<:Real, CELA<:AbstractArray{C,1}, NODA<:AbstractArray{Node{sdim,T},1}} <: Ferrite.AbstractGrid{sdim} cells::CELA diff --git a/src/Quadrature/quadrature.jl b/src/Quadrature/quadrature.jl index 5623f89b19..ec4d5be589 100644 --- a/src/Quadrature/quadrature.jl +++ b/src/Quadrature/quadrature.jl @@ -154,6 +154,17 @@ function QuadratureRule{RefPyramid, T}(quad_type::Symbol, order::Int) where T QuadratureRule{RefPyramid,T}(weights, points) end +####################### +# SmallQuadratureRule # +####################### +struct SmallQuadratureRule{N,T,dim} + weights::SVector{N,T} + points::SVector{N,Vec{dim,T}} +end +SmallQuadratureRule(qr::QuadratureRule) = SmallQuadratureRule(SVector{length(qr.weights)}(qr.weights), SVector{length(qr.points)}(qr.points)) +SmallQuadratureRule{refshape,T}(quad_type::Symbol, order::Int) where {refshape,T} = SmallQuadratureRule(QuadratureRule{refshape,T}(quad_type, order)) +SmallQuadratureRule{refshape,T}(order::Int) where {refshape,T} = SmallQuadratureRule(QuadratureRule{refshape,T}(order)) + ###################### # FaceQuadratureRule # ###################### @@ -230,6 +241,17 @@ function FaceQuadratureRule{RefPyramid, T}(quad_type::Symbol, order::Int) where [2,3,4,5], qr_tri.weights, qr_tri.points) end + +########################### +# SmallFaceQuadratureRule # +########################### +struct SmallFaceQuadratureRule{N,T,dim,M} + face_rules::SVector{M,SmallQuadratureRule{N, T, dim}} +end +SmallFaceQuadratureRule(fqr::FaceQuadratureRule{shape}) where shape = SmallFaceQuadratureRule(SVector{nfaces(shape)}(SmallQuadratureRule.(fqr.face_rules))) +SmallFaceQuadratureRule{refshape,T}(quad_type::Symbol, order::Int) where {refshape,T} = SmallFaceQuadratureRule(FaceQuadratureRule{refshape,T}(quad_type, order)) +SmallFaceQuadratureRule{refshape,T}(order::Int) where {refshape,T} = SmallFaceQuadratureRule(FaceQuadratureRule{refshape,T}(order)) + ################## # Common methods # ################## diff --git a/src/exports.jl b/src/exports.jl index dab7e368fb..27997ddb5d 100644 --- a/src/exports.jl +++ b/src/exports.jl @@ -21,7 +21,9 @@ export # Quadrature QuadratureRule, + SmallQuadratureRule, FaceQuadratureRule, + SmallFaceQuadratureRule, getnquadpoints, # FEValues From 4c563746fda29ead33a76ff515cb897578db90bd Mon Sep 17 00:00:00 2001 From: termi-official Date: Mon, 15 Jan 2024 21:15:51 +0100 Subject: [PATCH 15/16] manifest --- docs/Manifest.toml | 286 +++++++++++++++++++-------------------------- 1 file changed, 121 insertions(+), 165 deletions(-) diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 9d91fc57f3..2a62920e10 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -1,13 +1,13 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.9.4" +julia_version = "1.10.0" manifest_format = "2.0" project_hash = "3221800eb396241cbc64f6deb94d7c003e4f881d" [[deps.ADTypes]] -git-tree-sha1 = "332e5d7baeff8497b923b730b994fa480601efc7" +git-tree-sha1 = "41c37aa88889c171f1300ceac1313c06e891d245" uuid = "47edcb42-4c32-4615-8424-f2b9edc5f35b" -version = "0.2.5" +version = "0.2.6" [[deps.ANSIColoredPrinters]] git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" @@ -33,25 +33,6 @@ git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c" uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" version = "0.4.4" -[[deps.Accessors]] -deps = ["CompositionsBase", "ConstructionBase", "Dates", 
"InverseFunctions", "LinearAlgebra", "MacroTools", "Test"] -git-tree-sha1 = "a7055b939deae2455aa8a67491e034f735dd08d3" -uuid = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" -version = "0.1.33" - - [deps.Accessors.extensions] - AccessorsAxisKeysExt = "AxisKeys" - AccessorsIntervalSetsExt = "IntervalSets" - AccessorsStaticArraysExt = "StaticArrays" - AccessorsStructArraysExt = "StructArrays" - - [deps.Accessors.weakdeps] - AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5" - IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" - Requires = "ae029012-a4dd-5104-9daa-d747884805df" - StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" - StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" - [[deps.Adapt]] deps = ["LinearAlgebra", "Requires"] git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608" @@ -74,9 +55,9 @@ version = "0.2.0" [[deps.ArrayInterface]] deps = ["Adapt", "LinearAlgebra", "Requires", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "247efbccf92448be332d154d6ca56b9fcdd93c31" +git-tree-sha1 = "bbec08a37f8722786d87bedf84eae19c020c4efa" uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "7.6.1" +version = "7.7.0" [deps.ArrayInterface.extensions] ArrayInterfaceBandedMatricesExt = "BandedMatrices" @@ -96,9 +77,9 @@ version = "7.6.1" [[deps.ArrayLayouts]] deps = ["FillArrays", "LinearAlgebra"] -git-tree-sha1 = "af43df5704827c8618afd36eb56fcab20d3041ee" +git-tree-sha1 = "b08a4043e1c14096ef8efe4dd97e07de5cacf240" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" -version = "1.4.3" +version = "1.4.5" weakdeps = ["SparseArrays"] [deps.ArrayLayouts.extensions] @@ -275,16 +256,7 @@ weakdeps = ["Dates", "LinearAlgebra"] [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.5+0" - -[[deps.CompositionsBase]] -git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad" -uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" -version = "0.1.2" -weakdeps = ["InverseFunctions"] - - [deps.CompositionsBase.extensions] - CompositionsBaseInverseFunctionsExt = "InverseFunctions" +version = "1.0.5+1" [[deps.ConcreteStructs]] git-tree-sha1 = "f749037478283d372048690eb3b5f92a79432b34" @@ -361,9 +333,9 @@ version = "1.9.1" [[deps.DiffEqBase]] deps = ["ArrayInterface", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces"] -git-tree-sha1 = "4aff905c723e3f883064f00a4ba553b29372ce57" +git-tree-sha1 = "f45adda33e3eb24ae1428dfa9b57ded8248ced99" uuid = "2b5f629d-d688-5b77-993f-72d75c75574e" -version = "6.142.0" +version = "6.145.4" [deps.DiffEqBase.extensions] DiffEqBaseChainRulesCoreExt = "ChainRulesCore" @@ -570,9 +542,9 @@ version = "1.9.3" [[deps.FiniteDiff]] deps = ["ArrayInterface", "LinearAlgebra", "Requires", "Setfield", "SparseArrays"] -git-tree-sha1 = "c6e4a1fbe73b31a3dea94b1da449503b8830c306" +git-tree-sha1 = "73d1214fec245096717847c62d389a5d2ac86504" uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" -version = "2.21.1" +version = "2.22.0" [deps.FiniteDiff.extensions] FiniteDiffBandedMatricesExt = "BandedMatrices" @@ -640,10 +612,10 @@ deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" [[deps.GLFW_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", 
"Libglvnd_jll", "Pkg", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] -git-tree-sha1 = "d972031d28c8c8d9d7b41a536ad7bb0c2579caca" +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] +git-tree-sha1 = "ff38ba61beff76b8f4acad8ab0c97ef73bb670cb" uuid = "0656b61e-2033-5cc2-a64a-77c0f6c09b89" -version = "3.3.8+0" +version = "3.3.9+0" [[deps.GLU_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg"] @@ -654,7 +626,7 @@ version = "9.0.1+0" [[deps.GMP_jll]] deps = ["Artifacts", "Libdl"] uuid = "781609d7-10c4-51f6-84f2-b8444358ff6d" -version = "6.2.1+2" +version = "6.2.1+6" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] @@ -785,27 +757,16 @@ git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" version = "1.4.0" -[[deps.IntegerMathUtils]] -git-tree-sha1 = "b8ffb903da9f7b8cf695a8bead8e01814aa24b30" -uuid = "18e54dd8-cb9d-406c-a71d-865a43cbb235" -version = "0.1.2" - [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "31d6adb719886d4e32e38197aae466e98881320b" +git-tree-sha1 = "5fdf2fe6724d8caabf43b557b84ce53f3b7e2f6b" uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" -version = "2024.0.0+0" +version = "2024.0.2+0" [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -[[deps.InverseFunctions]] -deps = ["Test"] -git-tree-sha1 = "68772f49f54b479fa88ace904f6127f0a3bb2e46" -uuid = "3587e190-3f89-42d0-90ee-14403ec27112" -version = "0.1.12" - [[deps.InvertedIndices]] git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" @@ -847,9 +808,15 @@ version = "0.21.4" [[deps.JSON3]] deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"] -git-tree-sha1 = "95220473901735a0f4df9d1ca5b171b568b2daa3" +git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b" uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -version = "1.13.2" +version = "1.14.0" + + [deps.JSON3.extensions] + JSON3ArrowExt = ["ArrowTypes"] + + [deps.JSON3.weakdeps] + ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" [[deps.JSONSchema]] deps = ["Downloads", "JSON", "JSON3", "URIs"] @@ -877,9 +844,9 @@ version = "0.4.1" [[deps.KernelAbstractions]] deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] -git-tree-sha1 = "81de11f7b02465435aab0ed7e935965bfcb3072b" +git-tree-sha1 = "653e0824fc9ab55b3beec67a6dbbe514a65fb954" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.9.14" +version = "0.9.15" weakdeps = ["EnzymeCore"] [deps.KernelAbstractions.extensions] @@ -925,10 +892,10 @@ uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586" version = "1.0.0" [[deps.LLVMOpenMP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "f689897ccbe049adb19a065c495e75f372ecd42b" +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "d986ce2d884d49126836ea94ed5bfb0f12679713" uuid = "1d63c593-3942-5779-bab2-d838dc0a180e" -version = "15.0.4+0" +version = "15.0.7+0" [[deps.LZO_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -955,12 +922,6 @@ version = "0.16.1" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" SymEngine = "123dc426-2d89-5057-bbad-38513e3affd8" 
-[[deps.LatticeRules]] -deps = ["Random"] -git-tree-sha1 = "7f5b02258a3ca0221a6a9710b0a0a2e8fb4957fe" -uuid = "73f95e8e-ec14-4e6a-8b18-0d2e271c4e55" -version = "0.0.1" - [[deps.LayoutPointers]] deps = ["ArrayInterface", "LinearAlgebra", "ManualMemory", "SIMDTypes", "Static", "StaticArrayInterface"] git-tree-sha1 = "62edfee3211981241b57ff1cedf4d74d79519277" @@ -980,9 +941,9 @@ version = "0.15.1" [[deps.LazyArrays]] deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "MacroTools", "MatrixFactorizations", "SparseArrays"] -git-tree-sha1 = "08d56555410b0683cd7b48bff37aa59cbb0ea908" +git-tree-sha1 = "9cfca23ab83b0dfac93cb1a1ef3331ab9fe596a5" uuid = "5078a376-72f3-5289-bfd5-ec5146d43c02" -version = "1.8.2" +version = "1.8.3" weakdeps = ["StaticArrays"] [deps.LazyArrays.extensions] @@ -1003,9 +964,14 @@ uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" version = "8.4.0+0" [[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" +[[deps.LibGit2_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] +uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" +version = "1.6.4+0" + [[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" @@ -1085,10 +1051,10 @@ uuid = "18c40d15-f7cd-5a6d-bc92-87468d86c5db" version = "5.0.0+0" [[deps.LinearSolve]] -deps = ["ArrayInterface", "ConcreteStructs", "DocStringExtensions", "EnumX", "FastLapackInterface", "GPUArraysCore", "InteractiveUtils", "KLU", "Krylov", "Libdl", "LinearAlgebra", "MKL_jll", "PrecompileTools", "Preferences", "RecursiveFactorization", "Reexport", "Requires", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Sparspak", "StaticArraysCore", "UnPack"] -git-tree-sha1 = "1961a307fae7aef8e2b331902c6072f0e90e5abb" +deps = ["ArrayInterface", "ConcreteStructs", "DocStringExtensions", "EnumX", "FastLapackInterface", "GPUArraysCore", "InteractiveUtils", "KLU", "Krylov", "Libdl", "LinearAlgebra", "MKL_jll", "PrecompileTools", "Preferences", "RecursiveFactorization", "Reexport", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Sparspak", "StaticArraysCore", "UnPack"] +git-tree-sha1 = "97dc499678d50d989f1a74170840808641ce9880" uuid = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" -version = "2.20.2" +version = "2.22.0" [deps.LinearSolve.extensions] LinearSolveBandedMatricesExt = "BandedMatrices" @@ -1196,9 +1162,9 @@ version = "5.6.0+0" [[deps.MacroTools]] deps = ["Markdown", "Random"] -git-tree-sha1 = "9ee1618cbf5240e6d4e0371d6f24065083f60c48" +git-tree-sha1 = "b211c553c199c111d998ecdaf7623d1b89b69f93" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.11" +version = "0.5.12" [[deps.ManualMemory]] git-tree-sha1 = "bcaef4fc7a0cfe2cba636d84cda54b5e4e4ca3cd" @@ -1236,7 +1202,7 @@ version = "1.1.9" [[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" +version = "2.28.2+1" [[deps.Measures]] git-tree-sha1 = "c13304c81eec1ed3af7fc20e75fb6b26092a1102" @@ -1254,7 +1220,7 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2022.10.11" +version = "2023.1.10" [[deps.MuladdMacro]] git-tree-sha1 = "cac9cc5499c25554cba55cd3c30543cff5ca4fab" @@ -1267,12 +1233,6 @@ git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c" uuid = "d41bc354-129a-5804-8e4c-c37616107c6c" version = "7.8.3" -[[deps.NLsolve]] -deps = 
["Distances", "LineSearches", "LinearAlgebra", "NLSolversBase", "Printf", "Reexport"] -git-tree-sha1 = "019f12e9a1a7880459d0173c182e6a99365d7ac1" -uuid = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" -version = "4.5.1" - [[deps.NVTX]] deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] git-tree-sha1 = "8bc9ce4233be3c63f8dcd78ccaf1b63a9c0baa34" @@ -1293,34 +1253,42 @@ version = "1.0.2" [[deps.NearestNeighbors]] deps = ["Distances", "StaticArrays"] -git-tree-sha1 = "3ef8ff4f011295fd938a521cb605099cecf084ca" +git-tree-sha1 = "ded64ff6d4fdd1cb68dfcbb818c69e144a5b2e4c" uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" -version = "0.4.15" +version = "0.4.16" [[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" version = "1.2.0" [[deps.NonlinearSolve]] -deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "EnumX", "FastBroadcast", "FiniteDiff", "ForwardDiff", "LazyArrays", "LineSearches", "LinearAlgebra", "LinearSolve", "MaybeInplace", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SimpleNonlinearSolve", "SparseArrays", "SparseDiffTools", "StaticArrays", "UnPack"] -git-tree-sha1 = "ec6577e2ebccf19797edcbf5951229598ad35650" +deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "EnumX", "FastBroadcast", "FastClosures", "FiniteDiff", "ForwardDiff", "LazyArrays", "LineSearches", "LinearAlgebra", "LinearSolve", "MaybeInplace", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SimpleNonlinearSolve", "SparseArrays", "SparseDiffTools", "StaticArrays", "UnPack"] +git-tree-sha1 = "72b036b728461272ae1b1c3f7096cb4c319d8793" uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" -version = "3.0.1" +version = "3.4.0" [deps.NonlinearSolve.extensions] NonlinearSolveBandedMatricesExt = "BandedMatrices" NonlinearSolveFastLevenbergMarquardtExt = "FastLevenbergMarquardt" + NonlinearSolveFixedPointAccelerationExt = "FixedPointAcceleration" NonlinearSolveLeastSquaresOptimExt = "LeastSquaresOptim" NonlinearSolveMINPACKExt = "MINPACK" NonlinearSolveNLsolveExt = "NLsolve" + NonlinearSolveSIAMFANLEquationsExt = "SIAMFANLEquations" + NonlinearSolveSpeedMappingExt = "SpeedMapping" + NonlinearSolveSymbolicsExt = "Symbolics" NonlinearSolveZygoteExt = "Zygote" [deps.NonlinearSolve.weakdeps] BandedMatrices = "aae01518-5342-5314-be14-df237901396f" FastLevenbergMarquardt = "7a0df574-e128-4d35-8cbd-3d84502bf7ce" + FixedPointAcceleration = "817d07cb-a79a-5c30-9a31-890123675176" LeastSquaresOptim = "0fc2ff8b-aaa3-5acd-a817-1944a5e08891" MINPACK = "4854310b-de5a-5eb6-a2a5-c1dee2bd17f9" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" + SIAMFANLEquations = "084e46ad-d928-497d-ad5e-07fa361a48c4" + SpeedMapping = "f1835b91-879b-4a3f-a438-e4baacf14412" + Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [[deps.OCCT_jll]] @@ -1330,10 +1298,13 @@ uuid = "baad4e97-8daa-5946-aac2-2edac59d34e1" version = "7.6.2+2" [[deps.OffsetArrays]] -deps = ["Adapt"] -git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb" +git-tree-sha1 = "6a731f2b5c03157418a20c12195eb4b74c8f8621" uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" -version = "1.12.10" +version = "1.13.0" +weakdeps = ["Adapt"] + + [deps.OffsetArrays.extensions] + OffsetArraysAdaptExt = "Adapt" [[deps.Ogg_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -1344,12 +1315,12 @@ version = "1.3.5+1" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = 
"4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.21+4" +version = "0.3.23+2" [[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" -version = "0.8.1+0" +version = "0.8.1+2" [[deps.OpenSSL]] deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] @@ -1387,15 +1358,15 @@ uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.6.3" [[deps.OrdinaryDiffEq]] -deps = ["ADTypes", "Adapt", "ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "ExponentialUtilities", "FastBroadcast", "FastClosures", "FillArrays", "FiniteDiff", "ForwardDiff", "FunctionWrappersWrappers", "IfElse", "InteractiveUtils", "LineSearches", "LinearAlgebra", "LinearSolve", "Logging", "LoopVectorization", "MacroTools", "MuladdMacro", "NLsolve", "NonlinearSolve", "Polyester", "PreallocationTools", "PrecompileTools", "Preferences", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLNLSolve", "SciMLOperators", "SimpleNonlinearSolve", "SimpleUnPack", "SparseArrays", "SparseDiffTools", "StaticArrayInterface", "StaticArrays", "TruncatedStacktraces"] -git-tree-sha1 = "70edac326ad5bf91a51871f2437c6faa65956ef8" +deps = ["ADTypes", "Adapt", "ArrayInterface", "DataStructures", "DiffEqBase", "DocStringExtensions", "ExponentialUtilities", "FastBroadcast", "FastClosures", "FillArrays", "FiniteDiff", "ForwardDiff", "FunctionWrappersWrappers", "IfElse", "InteractiveUtils", "LineSearches", "LinearAlgebra", "LinearSolve", "Logging", "LoopVectorization", "MacroTools", "MuladdMacro", "NonlinearSolve", "Polyester", "PreallocationTools", "PrecompileTools", "Preferences", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "SimpleNonlinearSolve", "SimpleUnPack", "SparseArrays", "SparseDiffTools", "StaticArrayInterface", "StaticArrays", "TruncatedStacktraces"] +git-tree-sha1 = "96ae028da53cdfe24712ab015a6f854cfd7609c0" uuid = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" -version = "6.61.0" +version = "6.66.0" [[deps.PCRE2_jll]] deps = ["Artifacts", "Libdl"] uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15" -version = "10.42.0+0" +version = "10.42.0+1" [[deps.PackageExtensionCompat]] git-tree-sha1 = "fb28e33b8a95c4cee25ce296c817d89cc2e53518" @@ -1411,9 +1382,9 @@ version = "0.12.3" [[deps.Parsers]] deps = ["Dates", "PrecompileTools", "UUIDs"] -git-tree-sha1 = "a935806434c9d4c506ba941871b327b96d41f2bf" +git-tree-sha1 = "8489905bcdbcfac64d1daa51ca07c0d8f0283821" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.8.0" +version = "2.8.1" [[deps.Pipe]] git-tree-sha1 = "6842804e7867b115ca9de748a0cf6b364523c16d" @@ -1429,7 +1400,7 @@ version = "0.42.2+0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.9.2" +version = "1.10.0" [[deps.PlotThemes]] deps = ["PlotUtils", "Statistics"] @@ -1439,9 +1410,9 @@ version = "3.1.0" [[deps.PlotUtils]] deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "f92e1315dadf8c46561fb9396e525f7200cdc227" +git-tree-sha1 = "862942baf5663da528f66d24996eb6da85218e76" uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.3.5" +version = "1.4.0" [[deps.Plots]] deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", 
"PlotThemes", "PlotUtils", "PrecompileTools", "Preferences", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"] @@ -1489,9 +1460,9 @@ version = "0.2.4" [[deps.PreallocationTools]] deps = ["Adapt", "ArrayInterface", "ForwardDiff", "Requires"] -git-tree-sha1 = "f739b1b3cc7b9949af3b35089931f2b58c289163" +git-tree-sha1 = "01ac95fca7daabe77a9cb705862bd87016af9ddb" uuid = "d236fae5-4411-538c-8e31-a6e3d9e00b46" -version = "0.4.12" +version = "0.4.13" [deps.PreallocationTools.extensions] PreallocationToolsReverseDiffExt = "ReverseDiff" @@ -1517,12 +1488,6 @@ git-tree-sha1 = "88b895d13d53b5577fd53379d913b9ab9ac82660" uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" version = "2.3.1" -[[deps.Primes]] -deps = ["IntegerMathUtils"] -git-tree-sha1 = "1d05623b5952aed1307bf8b43bec8b8d1ef94b6e" -uuid = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae" -version = "0.5.5" - [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -1539,31 +1504,19 @@ git-tree-sha1 = "7c29f0e8c575428bd84dc3c72ece5178caa67336" uuid = "c0090381-4147-56d7-9ebc-da0b1113ec56" version = "6.5.2+2" -[[deps.QuasiMonteCarlo]] -deps = ["Accessors", "ConcreteStructs", "LatticeRules", "LinearAlgebra", "Primes", "Random", "Requires", "Sobol", "StatsBase"] -git-tree-sha1 = "cc086f8485bce77b6187141e1413c3b55f9a4341" -uuid = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b" -version = "0.3.3" - - [deps.QuasiMonteCarlo.extensions] - QuasiMonteCarloDistributionsExt = "Distributions" - - [deps.QuasiMonteCarlo.weakdeps] - Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" - [[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[deps.Random]] -deps = ["SHA", "Serialization"] +deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[deps.Random123]] deps = ["Random", "RandomNumbers"] -git-tree-sha1 = "552f30e847641591ba3f39fd1bed559b9deb0ef3" +git-tree-sha1 = "c860e84651f58ce240dd79e5d9e055d55234c35a" uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.6.1" +version = "1.6.2" [[deps.RandomNumbers]] deps = ["Random", "Requires"] @@ -1584,18 +1537,20 @@ uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c" version = "0.6.12" [[deps.RecursiveArrayTools]] -deps = ["Adapt", "ArrayInterface", "DocStringExtensions", "GPUArraysCore", "IteratorInterfaceExtensions", "LinearAlgebra", "RecipesBase", "Requires", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables"] -git-tree-sha1 = "d7087c013e8a496ff396bae843b1e16d9a30ede8" +deps = ["Adapt", "ArrayInterface", "DocStringExtensions", "GPUArraysCore", "IteratorInterfaceExtensions", "LinearAlgebra", "RecipesBase", "Requires", "SparseArrays", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables"] +git-tree-sha1 = "27ee1c03e732c488ecce1a25f0d7da9b5d936574" uuid = "731186ca-8d62-57ce-b412-fbd966d074cd" -version = "2.38.10" +version = "3.3.3" [deps.RecursiveArrayTools.extensions] + RecursiveArrayToolsFastBroadcastExt = "FastBroadcast" RecursiveArrayToolsMeasurementsExt = "Measurements" RecursiveArrayToolsMonteCarloMeasurementsExt = "MonteCarloMeasurements" RecursiveArrayToolsTrackerExt = "Tracker" RecursiveArrayToolsZygoteExt = "Zygote" [deps.RecursiveArrayTools.weakdeps] + FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898" Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7" MonteCarloMeasurements = 
"0987c9cc-fe09-11e8-30f0-b96dd679fdca" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" @@ -1664,10 +1619,10 @@ uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa" version = "0.6.42" [[deps.SciMLBase]] -deps = ["ADTypes", "ArrayInterface", "CommonSolve", "ConstructionBase", "Distributed", "DocStringExtensions", "EnumX", "FillArrays", "FunctionWrappersWrappers", "IteratorInterfaceExtensions", "LinearAlgebra", "Logging", "Markdown", "PrecompileTools", "Preferences", "Printf", "QuasiMonteCarlo", "RecipesBase", "RecursiveArrayTools", "Reexport", "RuntimeGeneratedFunctions", "SciMLOperators", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables", "TruncatedStacktraces"] -git-tree-sha1 = "32ea825941f7b58a6f48268f4b76971ae8eb9eec" +deps = ["ADTypes", "ArrayInterface", "CommonSolve", "ConstructionBase", "Distributed", "DocStringExtensions", "EnumX", "FillArrays", "FunctionWrappersWrappers", "IteratorInterfaceExtensions", "LinearAlgebra", "Logging", "Markdown", "PrecompileTools", "Preferences", "Printf", "RecipesBase", "RecursiveArrayTools", "Reexport", "RuntimeGeneratedFunctions", "SciMLOperators", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables", "TruncatedStacktraces"] +git-tree-sha1 = "2c3706caa9adab5031f30937699c46e159ae477f" uuid = "0bca4576-84f4-4d90-8ffe-ffa030f20462" -version = "2.10.0" +version = "2.13.0" [deps.SciMLBase.extensions] SciMLBaseChainRulesCoreExt = "ChainRulesCore" @@ -1686,12 +1641,6 @@ version = "2.10.0" RCall = "6f49c342-dc21-5d91-9882-a32aef131414" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" -[[deps.SciMLNLSolve]] -deps = ["DiffEqBase", "LineSearches", "NLsolve", "Reexport", "SciMLBase"] -git-tree-sha1 = "765b788339abd7d983618c09cfc0192e2b6b15fd" -uuid = "e9a6253c-8580-4d32-9898-8661bb511710" -version = "0.1.9" - [[deps.SciMLOperators]] deps = ["ArrayInterface", "DocStringExtensions", "Lazy", "LinearAlgebra", "Setfield", "SparseArrays", "StaticArraysCore", "Tricks"] git-tree-sha1 = "51ae235ff058a64815e0a2c34b1db7578a06813d" @@ -1736,9 +1685,15 @@ version = "1.1.0" [[deps.SimpleNonlinearSolve]] deps = ["ADTypes", "ArrayInterface", "ConcreteStructs", "DiffEqBase", "FiniteDiff", "ForwardDiff", "LinearAlgebra", "MaybeInplace", "PrecompileTools", "Reexport", "SciMLBase", "StaticArraysCore"] -git-tree-sha1 = "62b08ae70b9fe7e4dacffcf9cbb0e5c2d6688d09" +git-tree-sha1 = "8d672bd91dc432fb286b6d4bcf1a5dc417e932a3" uuid = "727e6d20-b764-4bd8-a329-72de5adea6c7" -version = "1.0.2" +version = "1.2.0" + + [deps.SimpleNonlinearSolve.extensions] + SimpleNonlinearSolvePolyesterForwardDiffExt = "PolyesterForwardDiff" + + [deps.SimpleNonlinearSolve.weakdeps] + PolyesterForwardDiff = "98d1487c-24ca-40b6-b7ab-df2af84e126b" [[deps.SimpleTraits]] deps = ["InteractiveUtils", "MacroTools"] @@ -1751,30 +1706,25 @@ git-tree-sha1 = "58e6353e72cde29b90a69527e56df1b5c3d8c437" uuid = "ce78b400-467f-4804-87d8-8f486da07d0a" version = "1.1.0" -[[deps.Sobol]] -deps = ["DelimitedFiles", "Random"] -git-tree-sha1 = "5a74ac22a9daef23705f010f72c81d6925b19df8" -uuid = "ed01d8cd-4d21-5b2a-85b4-cc3bdc58bad4" -version = "1.5.0" - [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" [[deps.SortingAlgorithms]] deps = ["DataStructures"] -git-tree-sha1 = "5165dfb9fd131cf0c6957a3a7605dede376e7b63" +git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085" uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.2.0" +version = "1.2.1" [[deps.SparseArrays]] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = 
"2f01184e-e22b-5df5-ae63-d93ebab69eaf" +version = "1.10.0" [[deps.SparseDiffTools]] deps = ["ADTypes", "Adapt", "ArrayInterface", "Compat", "DataStructures", "FiniteDiff", "ForwardDiff", "Graphs", "LinearAlgebra", "PackageExtensionCompat", "Random", "Reexport", "SciMLOperators", "Setfield", "SparseArrays", "StaticArrayInterface", "StaticArrays", "Tricks", "UnPack", "VertexSafeGraphs"] -git-tree-sha1 = "ddea63e5de5405878d990a2cea4fc1daf2d52d41" +git-tree-sha1 = "c281e11db4eacb36a292a054bac83c5a0aca2a26" uuid = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -version = "2.14.0" +version = "2.15.0" [deps.SparseDiffTools.extensions] SparseDiffToolsEnzymeExt = "Enzyme" @@ -1823,14 +1773,18 @@ weakdeps = ["OffsetArrays", "StaticArrays"] [[deps.StaticArrays]] deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"] -git-tree-sha1 = "5ef59aea6f18c25168842bded46b16662141ab87" +git-tree-sha1 = "fba11dbe2562eecdfcac49a05246af09ee64d055" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.7.0" -weakdeps = ["Statistics"] +version = "1.8.1" [deps.StaticArrays.extensions] + StaticArraysChainRulesCoreExt = "ChainRulesCore" StaticArraysStatisticsExt = "Statistics" + [deps.StaticArrays.weakdeps] + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" + Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + [[deps.StaticArraysCore]] git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" @@ -1839,7 +1793,7 @@ version = "1.4.2" [[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -version = "1.9.0" +version = "1.10.0" [[deps.StatsAPI]] deps = ["LinearAlgebra"] @@ -1882,15 +1836,14 @@ deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" [[deps.SuiteSparse_jll]] -deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] +deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "5.10.1+6" +version = "7.2.1+1" [[deps.SymbolicIndexingInterface]] -deps = ["DocStringExtensions"] -git-tree-sha1 = "f8ab052bfcbdb9b48fad2c80c873aa0d0344dfe5" +git-tree-sha1 = "be414bfd80c2c91197823890c66ef4b74f5bf5fe" uuid = "2efcf032-c050-4f8e-a9bb-153293bab1f5" -version = "0.2.2" +version = "0.3.1" [[deps.TOML]] deps = ["Dates"] @@ -1993,12 +1946,15 @@ deps = ["Dates", "LinearAlgebra", "Random"] git-tree-sha1 = "3c793be6df9dd77a0cf49d80984ef9ff996948fa" uuid = "1986cc42-f94f-5a68-af5c-568840ba703d" version = "1.19.0" -weakdeps = ["ConstructionBase", "InverseFunctions"] [deps.Unitful.extensions] ConstructionBaseUnitfulExt = "ConstructionBase" InverseFunctionsUnitfulExt = "InverseFunctions" + [deps.Unitful.weakdeps] + ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" + InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" + [[deps.UnitfulLatexify]] deps = ["LaTeXStrings", "Latexify", "Unitful"] git-tree-sha1 = "e2d817cc500e960fdbafcf988ac8436ba3208bfd" @@ -2239,7 +2195,7 @@ version = "0.4.9" [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" +version = "1.2.13+1" [[deps.Zstd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -2286,7 +2242,7 @@ version = "0.15.1+0" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+0" +version = "5.8.0+1" [[deps.libevdev_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -2332,7 +2288,7 @@ version = "1.52.0+1" 
[[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" +version = "17.4.0+2" [[deps.x264_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] From 2b7bb240a77f00ed3da4a6012ed7c60a0c710d6c Mon Sep 17 00:00:00 2001 From: termi-official Date: Tue, 27 Feb 2024 15:48:11 +0100 Subject: [PATCH 16/16] Tune runtimes --- docs/src/literate-tutorials/gpu_assembly.jl | 127 ++++++++++++-------- 1 file changed, 76 insertions(+), 51 deletions(-) diff --git a/docs/src/literate-tutorials/gpu_assembly.jl b/docs/src/literate-tutorials/gpu_assembly.jl index 5277eb5e7a..362b498a20 100644 --- a/docs/src/literate-tutorials/gpu_assembly.jl +++ b/docs/src/literate-tutorials/gpu_assembly.jl @@ -67,7 +67,7 @@ function GPUGrid(::Type{T}, grid::Grid) where T ) end -struct GPUDofHandler{GRID <: GPUGrid, CADOFS <: AbstractArray{Int,1}, CAOFFSETS <: AbstractArray{Int,1}} +struct GPUDofHandler{GRID <: GPUGrid, CADOFS <: AbstractArray{Int,1}, CAOFFSETS <: AbstractArray{Int,1}} <: Ferrite.AbstractDofHandler cell_dofs::CADOFS cell_dofs_offset::CAOFFSETS grid::GRID @@ -93,6 +93,11 @@ function Ferrite.celldofs(dh::GPUDofHandler, cell_index::Int) end Adapt.@adapt_structure GPUDofHandler +function Ferrite.celldofs(dh::DofHandler, cell_index::Int) + cell_dof_range = dh.cell_dofs_offset[cell_index]:(dh.cell_dofs_offset[cell_index+1]-1) + return @view dh.cell_dofs[cell_dof_range] +end + function shape_values(ip::Lagrange{RefQuadrilateral, 1}, ξ::Vec{2}) ξ_x = ξ[1] ξ_y = ξ[2] @@ -302,7 +307,7 @@ function gpu_full_mass_action!(uout::AbstractVector, u::AbstractVector, dh::GPUD synchronize() # Apply action one time - numthreads = 128 + numthreads = 8 for color ∈ colors numblocks = ceil(Int, length(color)/numthreads) # try @@ -341,24 +346,22 @@ Base.size(A::FerriteMFMassOperator, d) = ndofs(dh) # Square operator struct FerriteEAMassOperator{D2E, XE2, EAMATS <: AbstractArray{<:Any,3}} d2e::D2E + e2d::D2E + xe::XE2 xe2::XE2 element_matrices::EAMATS end function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::CuDeviceArray{T,2}) where T - # base_cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x - # stride = blockDim().x - # for cell_index ∈ base_cell_index:min(base_cell_index+stride,size(emats,3)) - cell_index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while cell_index ≤ size(emats,3) xei = @view xe[:,cell_index] xe2i = @view xe2[:,cell_index] emati = @view emats[:,:,cell_index] - # mul!(xe2i, emati, xei) # fails? xe2i .= T(0.0) - for j ∈ 1:size(emats,2), i ∈ 1:size(emats,1) + # mul!(xe2i, emati, xei) # fails? + for j ∈ 1:size(emati,2), i ∈ 1:size(emati,1) xe2i[i] += emati[i,j]*xei[j] end cell_index += stride @@ -366,17 +369,17 @@ function gemv_batched!(xe2::CuDeviceArray{T,2}, emats::CuDeviceArray{T,3}, xe::C end function LinearAlgebra.mul!(y,A::FerriteEAMassOperator{<:Any,<:Any,<:CuArray},x) - xe = A.d2e*x + mul!(A.xe, A.d2e, x) synchronize() - xev = reshape(xe, (size(A.element_matrices,2), size(A.element_matrices,3))) + xev = reshape(A.xe, (size(A.element_matrices,2), size(A.element_matrices,3))) xe2v = reshape(A.xe2, (size(A.element_matrices,2), size(A.element_matrices,3))) - synchronize() + # synchronize() # CUDA.CUBLAS.gemv_batched!(???) 
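    # A possible answer to the `gemv_batched!(???)` note above, sketched here as a
    # commented alternative: treat each element vector as an n×1 matrix and let cuBLAS
    # perform the batched product via `gemm_strided_batched!`. This assumes the element
    # matrices are stored as an (n, n, ncells) `CuArray` with a BLAS-supported element
    # type (Float32 in this example):
    # CUDA.CUBLAS.gemm_strided_batched!(
    #     'N', 'N', one(eltype(A.xe)),
    #     A.element_matrices,
    #     reshape(xev,  size(A.element_matrices, 2), 1, size(A.element_matrices, 3)),
    #     zero(eltype(A.xe)),
    #     reshape(xe2v, size(A.element_matrices, 1), 1, size(A.element_matrices, 3)),
    # )
    # With such a call the hand-written `gemv_batched!` launch below would not be needed.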
- numthreads = 128 + numthreads = 8 numblocks = ceil(Int, size(A.element_matrices,3)/numthreads) @cuda threads=numthreads blocks=numblocks gemv_batched!(xe2v, A.element_matrices, xev) synchronize() - mul!(y, transpose(A.d2e), A.xe2) + mul!(y, A.e2d, A.xe2) synchronize() end Base.eltype(A::FerriteEAMassOperator{<:Grid{<:Any,<:Any,T}}) where T = T @@ -392,47 +395,66 @@ struct FerriteRHS{DH, COLS, IP, GIP, QR} qr::QR end +function rhs_kernel_inner!(rhs::AbstractVector{T}, dh::Union{DofHandler{<:Any,<:Grid{sdim,<:Any,T}}, GPUDofHandler{<:GPUGrid{sdim,<:Any,T}}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_index::Int) where {sdim,T} + ## Grab the actual cell + cell = dh.grid.cells[cell_index] + # ## Grab the dofs on the cell + cell_dofs = celldofs(dh, cell_index) + ## Grab the buffers for the y and x + rhsₑ = @view rhs[cell_dofs] + ## Grab coordinate array + coords = cellnodesvec(cell, dh.grid.nodes) + ## Apply local action for y = Ax + for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) + ξ = qr.points[q_point] + # TODO recover abstraction layer + J = getJ(ip_geo, coords, ξ) + dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) + + # # TODO spatial_coordinate + x = zero(Vec{sdim,T}) + ϕᵍ = shape_values(ip_geo, ξ) + for i in 1:getnbasefunctions(ip_geo) + x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] + end + + ϕᶠ = shape_values(ip, ξ) + @inbounds for i in 1:getnbasefunctions(ip) + rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ + end + end +end + function gpu_rhs_kernel!(rhs::CuDeviceVector{T}, gpudh::GPUDofHandler{<:GPUGrid{sdim,<:Any,T}}, qr::SmallQuadratureRule{<:Any,T}, ip::Ferrite.Interpolation, ip_geo::Ferrite.Interpolation, cell_indices::AbstractVector) where {sdim,T} index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x stride = gridDim().x * blockDim().x while index <= length(cell_indices) ## Get index of the current cell cell_index = cell_indices[index] - ## Grab the actual cell - cell = gpudh.grid.cells[cell_index] - # ## Grab the dofs on the cell - cell_dofs = celldofs(gpudh, cell_index) - ## Grab the buffers for the y and x - rhsₑ = @view rhs[cell_dofs] - ## Grab coordinate array - coords = cellnodesvec(cell, gpudh.grid.nodes) - ## Apply local action for y = Ax - for q_point in 1:length(qr.weights) #getnquadpoints(cellvalues) - ξ = qr.points[q_point] - # TODO recover abstraction layer - J = getJ(ip_geo, coords, ξ) - dΩ = det(J)*qr.weights[q_point] #getdetJdV(cellvalues, q_point) - - # # TODO spatial_coordinate - x = zero(Vec{sdim,T}) - ϕᵍ = shape_values(ip_geo, ξ) - for i in 1:getnbasefunctions(ip_geo) - x += ϕᵍ[i]*coords[i] #geometric_value(fe_v, q_point, i) * x[i] - end - - ϕᶠ = shape_values(ip, ξ) - @inbounds for i in 1:getnbasefunctions(ip) - rhsₑ[i] += cos(x[1]/π)*cos(x[2]/π)*ϕᶠ[i]*dΩ - end - end + ## Launch assembly kernel + rhs_kernel_inner!(rhs, gpudh, qr, ip, ip_geo, cell_index) + ## Shift index up index += stride end end +function generate_rhs(cpurhs::FerriteRHS{<:DofHandler{<:Any, <:Grid{<:Any,<:Any,T}}}) where T + rhs = zeros(T,ndofs(dh)) + for color ∈ cpurhs.colors + # try + for cell_index ∈ color + # rhs_kernel_inner!(rhs, cpurhs.dh, cpurhs.qr, cpurhs.ip, cpurhs.ip_geo, cell_index) + # return @benchmark rhs_kernel_inner!($rhs, $(cpurhs.dh), $(cpurhs.qr), $cpurhs.ip, $cpurhs.ip_geo, $cell_index) + end + # catch err + # code_typed(err; interactive = true) + # end + end + return rhs +end -function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}) 
where T +function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T}}}, numthreads) where T rhs = CuArray(zeros(T,ndofs(dh))) - numthreads = 128 for color ∈ gpurhs.colors numblocks = ceil(Int, length(color)/numthreads) # try @@ -464,24 +486,27 @@ function generate_rhs(gpurhs::FerriteRHS{<:GPUDofHandler{<:GPUGrid{<:Any,<:Any,T return rhs end +cpurhsop = FerriteRHS(dh, colors, ip, ip, SmallQuadratureRule(qr)) +bcpu = generate_rhs(cpurhsop) +@benchmark generate_rhs($cpurhsop) + cudh = GPUDofHandler(dh) cucolors = CuArray.(colors) Agpu = FerriteMFMassOperator(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) -rhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) -bgpu = generate_rhs(rhsop) +gpurhsop = FerriteRHS(cudh, cucolors, ip, ip, SmallQuadratureRule(qr)) +bgpu = generate_rhs(gpurhsop, 16) +@benchmark generate_rhs($gpurhsop, 16) ugpu = CUDA.fill(0.f0, ndofs(dh)); # cg!(ugpu, Agpu, bgpu; verbose=true); -# @benchmark CUDA.@sync mul!($u,$Agpu,$bgpu) -Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSC(d2e), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) +Aeagpu = FerriteEAMassOperator(CUDA.CUSPARSE.CuSparseMatrixCSR(d2e), CUDA.CUSPARSE.CuSparseMatrixCSR(transpose(d2e)), CuArray(zeros(Float32, size(d2e,1))), CuArray(zeros(Float32, size(d2e,1))), CuArray(Keacpu)) # @benchmark CUDA.@sync mul!($ugpu,$Aeagpu,$bgpu) -ucpu = copy(Array(ugpu)) -bcpu = copy(Array(bgpu)) -# @benchmark mul!($u,$Kcpu,$(Array(bgpu))) +ucpu = fill(0.f0, ndofs(dh)); +# @benchmark mul!($ucpu,$Kcpu,$(Array(bcpu))) -Kgpu = CUDA.CUSPARSE.CuSparseMatrixCSC(Kcpu) -# @benchmark mul!($ugpu,$Kcpu,$bgpu) +Kgpu = CUDA.CUSPARSE.CuSparseMatrixCSR(Kcpu) +# @benchmark CUDA.@sync mul!($ugpu,$Kgpu,$bgpu) # ### Exporting to VTK # To visualize the result we export the grid and our field `u`
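# Before the export it can be reassuring to cross-check the operator variants set up
# above against each other. The following is a minimal sketch; it assumes that `Agpu`,
# `Aeagpu` and `Kgpu` all realize the matrix assembled into `Kcpu`, and that `mul!` is
# defined for each of them as done earlier in this example:
y_mf = CUDA.fill(0.f0, ndofs(dh)); mul!(y_mf, Agpu, bgpu)
y_ea = CUDA.fill(0.f0, ndofs(dh)); mul!(y_ea, Aeagpu, bgpu)
y_sp = CUDA.fill(0.f0, ndofs(dh)); mul!(y_sp, Kgpu, bgpu)
@show norm(y_mf - y_sp) norm(y_ea - y_sp)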