-
Notifications
You must be signed in to change notification settings - Fork 5
Alya
This tutorial explains how we used BOAST to port one of Alya
Kernels. Here, we focus on the kernel
nsi_element_assembly_split_oss
which is a kernel assembly of Navier
Stokes equations using the split OSS Variational Multiscale Model.
This tutorial is written using the org-mode of Emacs, it follows
the literate programming style and contains the entire source code
to generate the kernel. The source code is split into blocks which
are explained step by step. Two different kinds of sections compose
this tutorial, the first kind explains how the core of the kernel is
ported with BOAST, this is the description of the kernel. The second
kind addresses more technical aspects such as assembling all the
different pieces of the kernel together and is marked by the tag
technical. As the code is split this way, code blocks are
referenced by <<some-block>>
and are included in other code blocks
using org-mode functionality.
The .org version of this tutorial (available here: https://github.com/swhatelse/Alya/blob/master/org/Tutorial.org) can be used to generate the necessary ruby files to generate the kernel by using the org-mode tangling function by press C-c C-v t.
This section contains 2 FORTRAN source code versions of the kernel
nsi_element_assembly_split_oss
. The version 1 is deprecated and has
only been kept for information purpose. Only the second version
is used as reference implementation to check the BOAST generated
version.
subroutine nsi_element_assembly_split_oss(&
pnode,pgaus,pevat,gpden,gpvis,gppor,gpsp1,gpsp2,&
gpvol,gpsha,gpcar,gpadv,gpvep,gpprp,gpgrp,gprhs,&
gpvel,gpsgs,wgrgr,agrau,elvel,elauu,elaup,elapp,&
elapu,elrbu,elrbp,dtinv_loc,dtsgs)
integer(ip), intent(in) :: pnode,pgaus,pevat
real(rp), intent(in) :: gpden(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpvis(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gppor(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpsp1(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpsp2(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpvol(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpsha(VECTOR_SIZE,pnode,pgaus)
real(rp), intent(in) :: gpcar(VECTOR_SIZE,ndime,mnode,pgaus)
real(rp), intent(in) :: gpadv(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gpvep(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gpprp(VECTOR_SIZE,pgaus)
real(rp), intent(inout) :: gpgrp(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gprhs(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(in) :: gpvel(VECTOR_SIZE,ndime,pgaus,*)
real(rp), intent(in) :: gpsgs(VECTOR_SIZE,ndime,pgaus,*)
real(rp), intent(out) :: wgrgr(VECTOR_SIZE,pnode,pnode,pgaus)
real(rp), intent(out) :: agrau(VECTOR_SIZE,pnode,pgaus)
real(rp), intent(in) :: elvel(VECTOR_SIZE,ndime,pnode,*)
real(rp), intent(out) :: elauu(VECTOR_SIZE,pnode*ndime,pnode*ndime)
real(rp), intent(out) :: elaup(VECTOR_SIZE,pnode*ndime,pnode)
real(rp), intent(out) :: elapp(VECTOR_SIZE,pnode,pnode)
real(rp), intent(out) :: elapu(VECTOR_SIZE,pnode,pnode*ndime)
real(rp), intent(out) :: elrbu(VECTOR_SIZE,ndime,pnode)
real(rp), intent(out) :: elrbp(VECTOR_SIZE,pnode)
real(rp), intent(in) :: dtinv_loc(VECTOR_SIZE)
real(rp), intent(in) :: dtsgs(VECTOR_SIZE)
real(rp) :: gpsp1_p(VECTOR_SIZE,pgaus)
real(rp) :: gpsp1_v(VECTOR_SIZE,pgaus)
real(rp) :: c1(VECTOR_SIZE),c2(VECTOR_SIZE)
real(rp) :: c3(VECTOR_SIZE),c4(VECTOR_SIZE)
real(rp) :: alpha(VECTOR_SIZE),beta(VECTOR_SIZE)
real(rp) :: fact0(VECTOR_SIZE)
real(rp) :: fact1(VECTOR_SIZE)
real(rp) :: fact2(VECTOR_SIZE)
real(rp) :: fact3(VECTOR_SIZE)
real(rp) :: fact4(VECTOR_SIZE)
real(rp) :: fact5(VECTOR_SIZE)
real(rp) :: fact6(VECTOR_SIZE)
real(rp) :: fact7(VECTOR_SIZE)
real(rp) :: fact8(VECTOR_SIZE)
real(rp) :: gpveo(VECTOR_SIZE,3)
real(rp) :: fact1_p(VECTOR_SIZE)
integer(ip) :: inode,jnode,idofn,jdofn,idofv,jdof2,jdof3
integer(ip) :: idof1,idof3,idof2,igaus,idime,jdof1,jdofv,itime
#ifdef OPENACC
#define DEF_VECT ivect
#else
#define DEF_VECT 1:VECTOR_SIZE
#endif
!----------------------------------------------------------------------
!
! possibility of using only pressure stabilization - not ready with limiter - nor with shock capturing
!
!----------------------------------------------------------------------
gpsp1_p = gpsp1
gpsp1_v = gpsp1
!if(1==2) gpsp1_v = 0.0_rp
!if(1==2) gpsp1_p = min(gpsp1_p,1.0_rp/dtinv_loc) ! Activate this line only if runing without subscales
! and you want to limit tau1 like most groups do for small time step
!----------------------------------------------------------------------
!
! Initialization
!
!----------------------------------------------------------------------
elrbp = 0.0_rp
elrbu = 0.0_rp
elapp = 0.0_rp
elauu = 0.0_rp
elaup = 0.0_rp
elapu = 0.0_rp
!----------------------------------------------------------------------
!
! Test functions
!
!----------------------------------------------------------------------
!
! AGRAU = rho * (a.grad) Ni
! WGRGR = grad(Ni) . grad(Nj)
!
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
agrau(DEF_VECT,inode,igaus) = gpden(DEF_VECT,igaus) * ( &
& gpadv(DEF_VECT,1,igaus)*gpcar(DEF_VECT,1,inode,igaus) &
& + gpadv(DEF_VECT,2,igaus)*gpcar(DEF_VECT,2,inode,igaus) )
do jnode = 1,pnode
wgrgr(DEF_VECT,inode,jnode,igaus) = &
& gpcar(DEF_VECT,1,inode,igaus)*gpcar(DEF_VECT,1,jnode,igaus) &
& + gpcar(DEF_VECT,2,inode,igaus)*gpcar(DEF_VECT,2,jnode,igaus)
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
agrau(DEF_VECT,inode,igaus) = gpden(DEF_VECT,igaus) * ( &
& gpadv(DEF_VECT,1,igaus)*gpcar(DEF_VECT,1,inode,igaus) &
& + gpadv(DEF_VECT,2,igaus)*gpcar(DEF_VECT,2,inode,igaus) &
& + gpadv(DEF_VECT,3,igaus)*gpcar(DEF_VECT,3,inode,igaus) )
do jnode = 1,pnode
wgrgr(DEF_VECT,inode,jnode,igaus) = &
& gpcar(DEF_VECT,1,inode,igaus)*gpcar(DEF_VECT,1,jnode,igaus) &
& + gpcar(DEF_VECT,2,inode,igaus)*gpcar(DEF_VECT,2,jnode,igaus) &
& + gpcar(DEF_VECT,3,inode,igaus)*gpcar(DEF_VECT,3,jnode,igaus)
end do
end do
end do
end if
!----------------------------------------------------------------------
!
! Auu
!
!----------------------------------------------------------------------
!
! Galerkin + ( tau2 * div(u) , div(v) ) + ( tau1 * rho*a.grad(u), rho*a.grad(v) )
!
if( ndime == 2 ) then
do igaus = 1,pgaus
fact0(DEF_VECT) = gpsp2(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact6(DEF_VECT) = gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact7(DEF_VECT) = gpsp1_v(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact8(DEF_VECT) = pabdf_nsi(1) * gpden(DEF_VECT,igaus) * dtinv_loc(DEF_VECT) + gppor(DEF_VECT,igaus)
do inode = 1,pnode
idof1 = 2*inode-1
idof2 = 2*inode
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus) ! div(u) * tau2' * dv/dx
fact2(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus) ! div(u) * tau2' * dv/dy
fact4(DEF_VECT) = gpsha(DEF_VECT,inode,igaus) * gpvol(DEF_VECT,igaus)
do jnode = 1,pnode
jdof1 = 2*jnode-1
jdof2 = 2*jnode
fact5(DEF_VECT) = fact4(DEF_VECT) * ( agrau(DEF_VECT,jnode,igaus) + fact8(DEF_VECT) * gpsha(DEF_VECT,jnode,igaus) ) & ! ( rho/dt N_j + s Nj + rho*(a.grad)Nj ) Ni
& + fact6(DEF_VECT) * wgrgr(DEF_VECT,inode,jnode,igaus) & ! mu * grad(Ni) . grad(Nj)
& + fact7(DEF_VECT) * agrau(DEF_VECT,jnode,igaus) * agrau(DEF_VECT,inode,igaus) ! tau1 * rho*(a.grad)Nj * rho*(a.grad)Ni
elauu(DEF_VECT,idof1,jdof1) = elauu(DEF_VECT,idof1,jdof1) + fact1(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus) + fact5(DEF_VECT)
elauu(DEF_VECT,idof2,jdof1) = elauu(DEF_VECT,idof2,jdof1) + fact2(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus)
elauu(DEF_VECT,idof1,jdof2) = elauu(DEF_VECT,idof1,jdof2) + fact1(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus)
elauu(DEF_VECT,idof2,jdof2) = elauu(DEF_VECT,idof2,jdof2) + fact2(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus) + fact5(DEF_VECT)
end do
end do
end do
else
do igaus = 1,pgaus
fact0(DEF_VECT) = gpsp2(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact6(DEF_VECT) = gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact7(DEF_VECT) = gpsp1_v(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact8(DEF_VECT) = pabdf_nsi(1) * gpden(DEF_VECT,igaus) * dtinv_loc(DEF_VECT) + gppor(DEF_VECT,igaus)
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus) ! div(u) * tau2' * dv/dx
fact2(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus) ! div(u) * tau2' * dv/dy
fact3(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,3,inode,igaus) ! div(u) * tau2' * dv/dz
fact4(DEF_VECT) = gpsha(DEF_VECT,inode,igaus) * gpvol(DEF_VECT,igaus)
do jnode = 1,pnode
jdof1 = 3*jnode-2
jdof2 = 3*jnode-1
jdof3 = 3*jnode
fact5(DEF_VECT) = fact4(DEF_VECT) * ( agrau(DEF_VECT,jnode,igaus) + fact8(DEF_VECT) * gpsha(DEF_VECT,jnode,igaus) ) & ! ( rho/dt N_j + s Nj + rho*(a.grad)Nj ) Ni
+ fact6(DEF_VECT) * wgrgr(DEF_VECT,inode,jnode,igaus) & ! mu * grad(Ni) . grad(Nj)
+ fact7(DEF_VECT) * agrau(DEF_VECT,jnode,igaus) * agrau(DEF_VECT,inode,igaus) ! t1 * rho*(a.grad)Nj * rho*(a.grad)Ni
elauu(DEF_VECT,idof1,jdof1) = elauu(DEF_VECT,idof1,jdof1) + fact1(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus) + fact5(DEF_VECT)
elauu(DEF_VECT,idof2,jdof1) = elauu(DEF_VECT,idof2,jdof1) + fact2(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus)
elauu(DEF_VECT,idof3,jdof1) = elauu(DEF_VECT,idof3,jdof1) + fact3(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus)
elauu(DEF_VECT,idof1,jdof2) = elauu(DEF_VECT,idof1,jdof2) + fact1(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus)
elauu(DEF_VECT,idof2,jdof2) = elauu(DEF_VECT,idof2,jdof2) + fact2(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus) + fact5(DEF_VECT)
elauu(DEF_VECT,idof3,jdof2) = elauu(DEF_VECT,idof3,jdof2) + fact3(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus)
elauu(DEF_VECT,idof1,jdof3) = elauu(DEF_VECT,idof1,jdof3) + fact1(DEF_VECT) * gpcar(DEF_VECT,3,jnode,igaus)
elauu(DEF_VECT,idof2,jdof3) = elauu(DEF_VECT,idof2,jdof3) + fact2(DEF_VECT) * gpcar(DEF_VECT,3,jnode,igaus)
elauu(DEF_VECT,idof3,jdof3) = elauu(DEF_VECT,idof3,jdof3) + fact3(DEF_VECT) * gpcar(DEF_VECT,3,jnode,igaus) + fact5(DEF_VECT)
end do
end do
end do
end if
if( fvins_nsi > 0.9_rp ) then
!
! ( mu*duj/dxi , dv/dxj ) (only div form)
!
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime+idime
do jnode = 1,pnode
fact1 = gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,jnode,igaus)
jdofv = (jnode-1)*ndime + 1
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus)
end do
if( fvins_nsi == 2.0_rp ) then
fact1 = -2.0_rp/3.0_rp * gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,inode,igaus)
do jnode = 1,pnode
jdofv = (jnode-1)*ndime + 1
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus)
end do
end if
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
do jnode = 1,pnode
fact1 = gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,jnode,igaus)
jdofv = (jnode-1)*ndime + 1
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus)
jdofv = (jnode-1)*ndime + 3
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,3,inode,igaus)
end do
if( fvins_nsi == 2.0_rp ) then
fact1 = -2.0_rp / 3.0_rp * gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,inode,igaus)
do jnode = 1,pnode
jdofv = (jnode-1)*ndime + 1
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,1,jnode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,2,jnode,igaus)
jdofv = (jnode-1)*ndime + 3
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,3,jnode,igaus)
end do
end if
end do
end do
end do
end if
end if
!
! Lumped evolution matrix (only backward euler)
!
if( kfl_lumped == 1 ) then
!
! Remove Galerkin term and add lumped term
!
if( ndime == 2 ) then
call runend('PREGUNTAR A MATIAS QUE LO PROGRAME')
else
do igaus = 1,pgaus
gpveo(DEF_VECT,1:3) = 0.0_rp
do inode = 1,pnode
do idime = 1,ndime
gpveo(DEF_VECT,idime) = gpveo(DEF_VECT,idime) + elvel(DEF_VECT,idime,inode,2) * gpsha(DEF_VECT,inode,igaus)
end do
end do
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpden(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus) * dtinv_loc(DEF_VECT)
elauu(DEF_VECT,idof1,idof1) = elauu(DEF_VECT,idof1,idof1) + fact0(DEF_VECT)
elauu(DEF_VECT,idof2,idof2) = elauu(DEF_VECT,idof2,idof2) + fact0(DEF_VECT)
elauu(DEF_VECT,idof3,idof3) = elauu(DEF_VECT,idof3,idof3) + fact0(DEF_VECT)
do idime = 1,ndime
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) - fact0(DEF_VECT) * gpveo(DEF_VECT,idime)
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) + fact0(DEF_VECT) * elvel(DEF_VECT,idime,inode,2)
end do
do jnode = 1,pnode
jdof1 = 3*jnode-2
jdof2 = 3*jnode-1
jdof3 = 3*jnode
elauu(DEF_VECT,idof1,jdof1) = elauu(DEF_VECT,idof1,jdof1) - fact0*gpsha(DEF_VECT,jnode,igaus)
elauu(DEF_VECT,idof2,jdof2) = elauu(DEF_VECT,idof2,jdof2) - fact0*gpsha(DEF_VECT,jnode,igaus)
elauu(DEF_VECT,idof3,jdof3) = elauu(DEF_VECT,idof3,jdof3) - fact0*gpsha(DEF_VECT,jnode,igaus)
end do
end do
end do
end if
else if( kfl_lumped == 2 ) then
!
! No time term have been added up to now: add Galerkin term
!
do igaus = 1,pgaus
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpden(DEF_VECT,igaus) * dtinv_loc(DEF_VECT)
do inode = 1, pnode
fact1(DEF_VECT) = fact0(DEF_VECT) * gpsha(DEF_VECT,inode,igaus)
do idime = 1,ndime
idof1 = (inode-1) * ndime + idime
elauu(DEF_VECT,idof1,idof1) = elauu(DEF_VECT,idof1,idof1) + fact1(DEF_VECT)
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) + fact1(DEF_VECT) * elvel(DEF_VECT,idime,inode,2)
end do
end do
end do
end if
!----------------------------------------------------------------------
!
! Apu and Aup
!
!----------------------------------------------------------------------
!
! ( div(u) , q ) and - ( p , div(v) )
!
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 2*inode-1
idof2 = 2*inode
do jnode = 1,pnode
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,jnode,igaus)
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus)
fact2(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus)
elapu(DEF_VECT,jnode,idof1) = elapu(DEF_VECT,jnode,idof1) + fact1(DEF_VECT)
elapu(DEF_VECT,jnode,idof2) = elapu(DEF_VECT,jnode,idof2) + fact2(DEF_VECT)
elaup(DEF_VECT,idof1,jnode) = elaup(DEF_VECT,idof1,jnode) - fact1(DEF_VECT)
elaup(DEF_VECT,idof2,jnode) = elaup(DEF_VECT,idof2,jnode) - fact2(DEF_VECT)
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
do jnode = 1,pnode
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,jnode,igaus)
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus)
fact2(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus)
fact3(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,3,inode,igaus)
elapu(DEF_VECT,jnode,idof1) = elapu(DEF_VECT,jnode,idof1) + fact1(DEF_VECT)
elapu(DEF_VECT,jnode,idof2) = elapu(DEF_VECT,jnode,idof2) + fact2(DEF_VECT)
elapu(DEF_VECT,jnode,idof3) = elapu(DEF_VECT,jnode,idof3) + fact3(DEF_VECT)
elaup(DEF_VECT,idof1,jnode) = elaup(DEF_VECT,idof1,jnode) - fact1(DEF_VECT)
elaup(DEF_VECT,idof2,jnode) = elaup(DEF_VECT,idof2,jnode) - fact2(DEF_VECT)
elaup(DEF_VECT,idof3,jnode) = elaup(DEF_VECT,idof3,jnode) - fact3(DEF_VECT)
end do
end do
end do
end if
!----------------------------------------------------------------------
!
! App
!
!----------------------------------------------------------------------
!
! Pressure: ( tau1' * grad(p) , grad(q) )
!
do igaus = 1,pgaus
do inode = 1,pnode
do jnode = inode+1,pnode
fact1(DEF_VECT) = gpsp1_p(DEF_VECT,igaus) * wgrgr(DEF_VECT,jnode,inode,igaus) * gpvol(DEF_VECT,igaus)
elapp(DEF_VECT,jnode,inode) = elapp(DEF_VECT,jnode,inode) + fact1(DEF_VECT)
elapp(DEF_VECT,inode,jnode) = elapp(DEF_VECT,inode,jnode) + fact1(DEF_VECT)
end do
fact1(DEF_VECT) = gpsp1_p(DEF_VECT,igaus) * wgrgr(DEF_VECT,inode,inode,igaus) * gpvol(DEF_VECT,igaus)
elapp(DEF_VECT,inode,inode) = elapp(DEF_VECT,inode,inode) + fact1(DEF_VECT)
end do
end do
!----------------------------------------------------------------------
!
! bu and bp
!
! P1 = P [ tau1' * rho * a.grad(u) ]
! P1' = P1 + tau1' * rho * u'n / dt
!
! P2 = P [ tau1' * ( grad(p) - f ) ]
! P2' = P2 + tau1' * rho * u'n / dt + tau1' * f
!
!----------------------------------------------------------------------
!
! Limiter
!
if( kfl_limit_nsi == -1 ) then
gpvep(DEF_VECT,:,:) = 0.0_rp
else if( kfl_limit_nsi > 0 ) then
do igaus = 1,pgaus
c1(DEF_VECT) = 0.0_rp
c2(DEF_VECT) = 0.0_rp
c3(DEF_VECT) = 0.0_rp
do idime = 1,ndime
c4(DEF_VECT) = 0.0_rp
do inode = 1,pnode
c4(DEF_VECT) = c4(DEF_VECT) + agrau(DEF_VECT,inode,igaus) * elvel(DEF_VECT,idime,inode,1)
end do
c4(DEF_VECT) = gpsp1(DEF_VECT,igaus) * c4(DEF_VECT)
c1(DEF_VECT) = c1(DEF_VECT) + ( gpvep(DEF_VECT,idime,igaus) - c4(DEF_VECT) )**2
c3(DEF_VECT) = c3(DEF_VECT) + gpvep(DEF_VECT,idime,igaus) * gpvep(DEF_VECT,idime,igaus)
c2(DEF_VECT) = c2(DEF_VECT) + c4(DEF_VECT) * c4(DEF_VECT)
end do
c3(DEF_VECT) = sqrt( c2(DEF_VECT) ) + sqrt( c3(DEF_VECT) )
c1(DEF_VECT) = sqrt( c1(DEF_VECT) )
beta(DEF_VECT) = c1(DEF_VECT) / ( c3(DEF_VECT) + epsilon(1.0_rp) )
if( kfl_limit_nsi == 1 ) then
alpha(DEF_VECT) = min(1.0_rp,2.0_rp*(1.0_rp-beta(DEF_VECT)))
else if( kfl_limit_nsi == 2 ) then
alpha(DEF_VECT) = 0.5_rp*(tanh(20.0_rp*(beta(DEF_VECT)-0.8_rp))+1.0_rp)
end if
do idime = 1,ndime
gpvep(DEF_VECT,idime,igaus) = alpha(DEF_VECT) * gpvep(DEF_VECT,idime,igaus)
end do
end do
end if
!
! P2 <= P2 + tau1' * f
!
do igaus = 1,pgaus
do idime = 1,ndime
gpgrp(DEF_VECT,idime,igaus) = gpgrp(DEF_VECT,idime,igaus) + gpsp1_p(DEF_VECT,igaus) * gprhs(DEF_VECT,idime,igaus)
end do
end do
!
! P1 <= P1 + tau1' * rho * u'n / dt
! P2 <= P2 + tau1' * rho * u'n / dt
!
if( kfl_sgsti_nsi == 1 ) then
do igaus = 1,pgaus
fact1(DEF_VECT) = gpden(DEF_VECT,igaus) * dtsgs(DEF_VECT) * gpsp1_v(DEF_VECT,igaus)
fact1_p (DEF_VECT) = gpden(DEF_VECT,igaus) * dtsgs(DEF_VECT) * gpsp1_p(DEF_VECT,igaus)
do idime = 1,ndime
gpvep(DEF_VECT,idime,igaus) = gpvep(DEF_VECT,idime,igaus) + fact1(DEF_VECT) * gpsgs(DEF_VECT,idime,igaus,2)
gpgrp(DEF_VECT,idime,igaus) = gpgrp(DEF_VECT,idime,igaus) + fact1_p(DEF_VECT) * gpsgs(DEF_VECT,idime,igaus,2)
end do
end do
end if
!
! bu = ( f + rho*u^n/dt , v ) + ( rho * a.grad(v) , tau1' * rho u'^n/dt + P1 )
! = ( f + rho*u^n/dt , v ) + ( rho * a.grad(v) , P1' )
!
! bp = ( f + rho*u'^n/dt , tau1' grad(q) ) + ( P2 , grad(q) )
! = ( P2' , grad(q) )
!
if( ndime == 2 ) then
do igaus = 1,pgaus
fact4(DEF_VECT) = gpden(DEF_VECT,igaus) * dtinv_loc(DEF_VECT)
do itime = 2,nbdfp_nsi
gprhs(DEF_VECT,1,igaus) = gprhs(DEF_VECT,1,igaus) - pabdf_nsi(itime) * fact4(DEF_VECT) * gpvel(DEF_VECT,1,igaus,itime)
gprhs(DEF_VECT,2,igaus) = gprhs(DEF_VECT,2,igaus) - pabdf_nsi(itime) * fact4(DEF_VECT) * gpvel(DEF_VECT,2,igaus,itime)
end do
do inode = 1,pnode
fact1(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus) ! ( f + rho*u^n/dt , v )
fact3(DEF_VECT) = gpvol(DEF_VECT,igaus) * agrau(DEF_VECT,inode,igaus) ! ( rho * a.grad(v) , P1' )
elrbu(DEF_VECT,1,inode) = elrbu(DEF_VECT,1,inode) + fact1(DEF_VECT) * gprhs(DEF_VECT,1,igaus) + fact3(DEF_VECT) * gpvep(DEF_VECT,1,igaus)
elrbu(DEF_VECT,2,inode) = elrbu(DEF_VECT,2,inode) + fact1(DEF_VECT) * gprhs(DEF_VECT,2,igaus) + fact3(DEF_VECT) * gpvep(DEF_VECT,2,igaus)
elrbp(DEF_VECT,inode) = elrbp(DEF_VECT,inode) + gpvol(DEF_VECT,igaus) * ( & ! ( P2' , grad(q) )
& gpcar(DEF_VECT,1,inode,igaus) * gpgrp(DEF_VECT,1,igaus) &
& + gpcar(DEF_VECT,2,inode,igaus) * gpgrp(DEF_VECT,2,igaus) )
end do
end do
else
do igaus = 1,pgaus
fact4(DEF_VECT) = gpden(DEF_VECT,igaus) * dtinv_loc(DEF_VECT)
do itime = 2,nbdfp_nsi
gprhs(DEF_VECT,1,igaus) = gprhs(DEF_VECT,1,igaus) - pabdf_nsi(itime) * fact4(DEF_VECT) * gpvel(DEF_VECT,1,igaus,itime)
gprhs(DEF_VECT,2,igaus) = gprhs(DEF_VECT,2,igaus) - pabdf_nsi(itime) * fact4(DEF_VECT) * gpvel(DEF_VECT,2,igaus,itime)
gprhs(DEF_VECT,3,igaus) = gprhs(DEF_VECT,3,igaus) - pabdf_nsi(itime) * fact4(DEF_VECT) * gpvel(DEF_VECT,3,igaus,itime)
end do
do inode = 1,pnode
fact1 = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus)
fact3 = gpvol(DEF_VECT,igaus) * agrau(DEF_VECT,inode,igaus)
elrbu(DEF_VECT,1,inode) = elrbu(DEF_VECT,1,inode) + fact1(DEF_VECT) * gprhs(DEF_VECT,1,igaus) + fact3(DEF_VECT) * gpvep(DEF_VECT,1,igaus)
elrbu(DEF_VECT,2,inode) = elrbu(DEF_VECT,2,inode) + fact1(DEF_VECT) * gprhs(DEF_VECT,2,igaus) + fact3(DEF_VECT) * gpvep(DEF_VECT,2,igaus)
elrbu(DEF_VECT,3,inode) = elrbu(DEF_VECT,3,inode) + fact1(DEF_VECT) * gprhs(DEF_VECT,3,igaus) + fact3(DEF_VECT) * gpvep(DEF_VECT,3,igaus)
elrbp(DEF_VECT,inode) = elrbp(DEF_VECT,inode) + gpvol(DEF_VECT,igaus) * ( &
& gpcar(DEF_VECT,1,inode,igaus) * gpgrp(DEF_VECT,1,igaus) &
& + gpcar(DEF_VECT,2,inode,igaus) * gpgrp(DEF_VECT,2,igaus) &
& + gpcar(DEF_VECT,3,inode,igaus) * gpgrp(DEF_VECT,3,igaus) )
end do
end do
end if
end subroutine nsi_element_assembly_split_oss
subroutine nsi_element_assembly_split_oss(&
pnode,pgaus,gpden,gpvis,gppor,gpsp1,gpsp2,gpvol, &
gpsha,gpcar,gpadv,gpvep,gpgrp,gprhs,gprhc,gpvel, &
gpsgs,elvel,elpre,elbub,elauu,elaup,elapp,elapu, &
elrbu,elrbp,dtinv_loc,dtsgs,pbubl,gpsha_bub, &
gpcar_bub,elauq,elapq,elaqu,elaqp,elaqq,elrbq,&
! Original global variables
kfl_lumped,&
mnode,ntens,&
kfl_duatss,&
fact_duatss,&
kfl_stabi_nsi,&
fvins_nsi,fcons_nsi,bemol_nsi,kfl_regim_nsi,&
fvela_nsi,kfl_rmom2_nsi,kfl_press_nsi,&
kfl_p1ve2_nsi,kfl_linea_nsi,pabdf_nsi,&
kfl_confi_nsi,nbdfp_nsi,kfl_sgsti_nsi,&
kfl_nota1_nsi,kfl_limit_nsi,kfl_penal_nsi,&
penal_nsi,&
kfl_bubbl_nsi)
integer(ip), intent(in) :: kfl_lumped
integer(ip), intent(in) :: mnode
integer(ip), intent(in) :: ntens
integer(ip), intent(in) :: kfl_duatss
integer(ip), intent(in) :: fact_duatss
integer(ip), intent(in) :: kfl_stabi_nsi
real(rp), intent(in) :: fvins_nsi
real(rp), intent(in) :: fcons_nsi
real(rp), intent(in) :: bemol_nsi
integer(ip), intent(in) :: kfl_regim_nsi
real(rp), intent(in) :: fvela_nsi(3)
integer(ip), intent(in) :: kfl_rmom2_nsi
integer(ip), intent(in) :: kfl_press_nsi
integer(ip), intent(in) :: kfl_p1ve2_nsi
integer(ip), intent(in) :: kfl_linea_nsi
real(rp), intent(in) :: pabdf_nsi(*)
integer(ip), intent(in) :: kfl_confi_nsi
integer(ip), intent(in) :: nbdfp_nsi
integer(ip), intent(in) :: kfl_sgsti_nsi
integer(ip), intent(in) :: kfl_nota1_nsi
integer(ip), intent(in) :: kfl_limit_nsi
integer(ip), intent(in) :: kfl_penal_nsi
real(rp), intent(in) :: penal_nsi
integer(ip), intent(in) :: kfl_bubbl_nsi
integer(ip), intent(in) :: pnode,pgaus
real(rp), intent(in) :: gpden(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpvis(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gppor(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpsp1(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpsp2(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpvol(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpsha(VECTOR_SIZE,pnode,pgaus)
real(rp), intent(in) :: gpcar(VECTOR_SIZE,ndime,mnode,pgaus)
real(rp), intent(in) :: gpadv(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gpvep(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gpgrp(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gprhs(VECTOR_SIZE,ndime,pgaus)
real(rp), intent(inout) :: gprhc(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpvel(VECTOR_SIZE,ndime,pgaus,*)
real(rp), intent(in) :: gpsgs(VECTOR_SIZE,ndime,pgaus,*)
real(rp), intent(in) :: elvel(VECTOR_SIZE,ndime,pnode,*)
real(rp), intent(in) :: elpre(VECTOR_SIZE,pnode,*)
real(rp), intent(in) :: elbub(VECTOR_SIZE)
! Matrices
real(rp), intent(out) :: elauu(VECTOR_SIZE,pnode*ndime,pnode*ndime)
real(rp), intent(out) :: elaup(VECTOR_SIZE,pnode*ndime,pnode)
real(rp), intent(out) :: elapp(VECTOR_SIZE,pnode,pnode)
real(rp), intent(out) :: elapu(VECTOR_SIZE,pnode,pnode*ndime)
real(rp), intent(out) :: elrbu(VECTOR_SIZE,ndime,pnode)
real(rp), intent(out) :: elrbp(VECTOR_SIZE,pnode)
! Others
real(rp), intent(in) :: dtinv_loc(VECTOR_SIZE)
real(rp), intent(in) :: dtsgs(VECTOR_SIZE)
integer(ip), intent(in) :: pbubl(VECTOR_SIZE)
real(rp), intent(in) :: gpsha_bub(VECTOR_SIZE,pgaus)
real(rp), intent(in) :: gpcar_bub(VECTOR_SIZE,ndime,pgaus)
! Enrichement Element matrices
real(rp), intent(out) :: elauq(VECTOR_SIZE,pnode*ndime,1)
real(rp), intent(out) :: elapq(VECTOR_SIZE,pnode,1)
real(rp), intent(out) :: elaqu(VECTOR_SIZE,1,pnode*ndime)
real(rp), intent(out) :: elaqp(VECTOR_SIZE,1,pnode)
real(rp), intent(out) :: elaqq(VECTOR_SIZE,1,1)
real(rp), intent(out) :: elrbq(VECTOR_SIZE,1)
! Local arrays
real(rp) :: wgrgr(VECTOR_SIZE,pnode,pnode,pgaus)
real(rp) :: agrau(VECTOR_SIZE,pnode,pgaus)
real(rp) :: gpsp1_p(VECTOR_SIZE,pgaus)
real(rp) :: gpsp1_v(VECTOR_SIZE,pgaus)
real(rp) :: gpsp2_v(VECTOR_SIZE,pgaus)
real(rp) :: c1(VECTOR_SIZE)
real(rp) :: c2(VECTOR_SIZE)
real(rp) :: c3(VECTOR_SIZE)
real(rp) :: c4(VECTOR_SIZE)
real(rp) :: alpha(VECTOR_SIZE)
real(rp) :: beta(VECTOR_SIZE)
real(rp) :: fact0(VECTOR_SIZE)
real(rp) :: fact1(VECTOR_SIZE)
real(rp) :: fact2(VECTOR_SIZE)
real(rp) :: fact3(VECTOR_SIZE)
real(rp) :: fact4(VECTOR_SIZE)
real(rp) :: fact5(VECTOR_SIZE)
real(rp) :: fact6(VECTOR_SIZE)
real(rp) :: fact7(VECTOR_SIZE)
real(rp) :: fact8(VECTOR_SIZE)
real(rp) :: gpveo(VECTOR_SIZE,3)
real(rp) :: fact1_p(VECTOR_SIZE)
real(rp) :: dtinv_mod(VECTOR_SIZE)
integer(ip) :: inode,jnode,jdime
integer(ip) :: idofv,jdof2,jdof3,ivect
integer(ip) :: idof1,idof3,idof2,igaus
integer(ip) :: idime,jdof1,jdofv,itime
#ifdef OPENACC
#define DEF_VECT ivect
#else
#define DEF_VECT 1:VECTOR_SIZE
#endif
dtinv_mod = dtinv_loc
!----------------------------------------------------------------------
!
! possibility of using only pressure stabilization - not ready with limiter - nor with shock capturing
!
!----------------------------------------------------------------------
gpsp1_p = gpsp1
gpsp1_v = gpsp1
gpsp2_v = gpsp2
if( kfl_nota1_nsi == 1 ) gpsp1_v = 0.0_rp
!----------------------------------------------------------------------
!
! Initialization
!
!----------------------------------------------------------------------
elrbp = 0.0_rp
elrbu = 0.0_rp
elapp = 0.0_rp
elauu = 0.0_rp
elaup = 0.0_rp
elapu = 0.0_rp
!----------------------------------------------------------------------
!
! Test functions
!
!----------------------------------------------------------------------
!
! AGRAU = rho * (a.grad) Ni
! WGRGR = grad(Ni) . grad(Nj)
!
agrau(DEF_VECT,:,:) = 0.0_rp
wgrgr(DEF_VECT,:,:,:) = 0.0_rp
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
agrau(DEF_VECT,inode,igaus) = agrau(DEF_VECT,inode,igaus) + &
& gpadv(DEF_VECT,idime,igaus) * gpcar(DEF_VECT,idime,inode,igaus)
end do
agrau(DEF_VECT,inode,igaus) = gpden(DEF_VECT,igaus) * agrau(DEF_VECT,inode,igaus)
do jnode = 1,pnode
do idime = 1,ndime
wgrgr(DEF_VECT,inode,jnode,igaus) = wgrgr(DEF_VECT,inode,jnode,igaus) + &
& gpcar(DEF_VECT,idime,inode,igaus)*gpcar(DEF_VECT,idime,jnode,igaus)
end do
end do
end do
end do
!----------------------------------------------------------------------
!
! Auu
!
!----------------------------------------------------------------------
!
! Galerkin + ( tau2 * div(u) , div(v) ) + ( tau1 * rho*a.grad(u), rho*a.grad(v) )
!
do igaus = 1,pgaus
fact0(DEF_VECT) = gpsp2_v(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact6(DEF_VECT) = gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact7(DEF_VECT) = gpsp1_v(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus)
fact8(DEF_VECT) = pabdf_nsi(1) * gpden(DEF_VECT,igaus) * dtinv_mod(DEF_VECT) + gppor(DEF_VECT,igaus)
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime+idime
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,idime,inode,igaus)
do jnode = 1,pnode
!
! div(u) * tau2' * div(v)
!
do jdime = 1,ndime
jdofv = (jnode-1)*ndime+jdime
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,jdime,jnode,igaus)
end do
!
! ( rho/dt N_j + s Nj + rho*(a.grad)Nj ) Ni
! + mu * grad(Ni) . grad(Nj)
! + t1 * rho*(a.grad)Nj * rho*(a.grad)Ni
!
jdofv = (jnode-1)*ndime+idime
fact4(DEF_VECT) = gpsha(DEF_VECT,inode,igaus) * gpvol(DEF_VECT,igaus)
fact5(DEF_VECT) = fact4(DEF_VECT) * ( agrau(DEF_VECT,jnode,igaus) + fact8(DEF_VECT) * gpsha(DEF_VECT,jnode,igaus) ) &
& + fact6(DEF_VECT) * wgrgr(DEF_VECT,inode,jnode,igaus) &
& + fact7(DEF_VECT) * agrau(DEF_VECT,jnode,igaus) * agrau(DEF_VECT,inode,igaus)
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact5(DEF_VECT)
end do
end do
end do
end do
!
! ( mu*duj/dxi , dv/dxj ) (only div form)
!
if( fvins_nsi > 0.9_rp ) then
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
do jnode = 1,pnode
fact1(DEF_VECT) = gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,jnode,igaus)
do jdime = 1,ndime
jdofv = (jnode-1)*ndime + jdime
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,jdime,inode,igaus)
end do
end do
if( fvins_nsi == 2.0_rp ) then
fact1(DEF_VECT) = -2.0_rp / 3.0_rp * gpvis(DEF_VECT,igaus) * gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,inode,igaus)
do jnode = 1,pnode
do jdime = 1,ndime
jdofv = (jnode-1)*ndime + jdime
elauu(DEF_VECT,idofv,jdofv) = elauu(DEF_VECT,idofv,jdofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,jdime,jnode,igaus)
end do
end do
end if
end do
end do
end do
end if
!call nsi_element_system_output(&
! pnode,elauu(1,:,:),elaup(1,:,:),elapp(1,:,:),elapu(1,:,:),elrbu(1,:,:),elrbp(1,:),&
! elauq(1,:,:),elapq(1,:,:),elaqu(1,:,:),elaqp(1,:,:),elaqq(1,:,:),elrbq(1,:))
!stop
!
! Lumped evolution matrix (only backward euler)
!
if( kfl_lumped == 1 ) then
!
! Remove Galerkin term and add lumped term
!
if( ndime == 2 ) then
stop
else
do igaus = 1,pgaus
gpveo(DEF_VECT,1:3) = 0.0_rp
do inode = 1,pnode
do idime = 1,ndime
gpveo(DEF_VECT,idime) = gpveo(DEF_VECT,idime) + elvel(DEF_VECT,idime,inode,2) * gpsha(DEF_VECT,inode,igaus)
end do
end do
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpden(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus) * dtinv_mod(DEF_VECT)
elauu(DEF_VECT,idof1,idof1) = elauu(DEF_VECT,idof1,idof1) + fact0(DEF_VECT)
elauu(DEF_VECT,idof2,idof2) = elauu(DEF_VECT,idof2,idof2) + fact0(DEF_VECT)
elauu(DEF_VECT,idof3,idof3) = elauu(DEF_VECT,idof3,idof3) + fact0(DEF_VECT)
do idime = 1,ndime
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) - fact0(DEF_VECT) * gpveo(DEF_VECT,idime)
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) + fact0(DEF_VECT) * elvel(DEF_VECT,idime,inode,2)
end do
do jnode = 1,pnode
jdof1 = 3*jnode-2
jdof2 = 3*jnode-1
jdof3 = 3*jnode
elauu(DEF_VECT,idof1,jdof1) = elauu(DEF_VECT,idof1,jdof1) - fact0(DEF_VECT) * gpsha(DEF_VECT,jnode,igaus)
elauu(DEF_VECT,idof2,jdof2) = elauu(DEF_VECT,idof2,jdof2) - fact0(DEF_VECT) * gpsha(DEF_VECT,jnode,igaus)
elauu(DEF_VECT,idof3,jdof3) = elauu(DEF_VECT,idof3,jdof3) - fact0(DEF_VECT) * gpsha(DEF_VECT,jnode,igaus)
end do
end do
end do
end if
else if( kfl_lumped == 2 ) then
!
! No time term have been added up to now: add Galerkin term
!
do igaus = 1,pgaus
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpden(DEF_VECT,igaus) * dtinv_mod(DEF_VECT)
do inode = 1, pnode
fact1(DEF_VECT) = fact0(DEF_VECT) * gpsha(DEF_VECT,inode,igaus)
do idime = 1,ndime
idof1 = (inode-1) * ndime + idime
elauu(DEF_VECT,idof1,idof1) = elauu(DEF_VECT,idof1,idof1) + fact1(DEF_VECT)
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) + fact1(DEF_VECT) * elvel(DEF_VECT,idime,inode,2)
end do
end do
end do
end if
!----------------------------------------------------------------------
!
! Apu and Aup
!
!----------------------------------------------------------------------
!
! ( div(u) , q ) and - ( p , div(v) )
!
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 2*inode-1
idof2 = 2*inode
do jnode = 1,pnode
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,jnode,igaus)
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus)
fact2(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus)
elapu(DEF_VECT,jnode,idof1) = elapu(DEF_VECT,jnode,idof1) + fact1(DEF_VECT)
elapu(DEF_VECT,jnode,idof2) = elapu(DEF_VECT,jnode,idof2) + fact2(DEF_VECT)
elaup(DEF_VECT,idof1,jnode) = elaup(DEF_VECT,idof1,jnode) - fact1(DEF_VECT)
elaup(DEF_VECT,idof2,jnode) = elaup(DEF_VECT,idof2,jnode) - fact2(DEF_VECT)
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
do jnode = 1,pnode
fact0(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,jnode,igaus)
fact1(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,1,inode,igaus)
fact2(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,2,inode,igaus)
fact3(DEF_VECT) = fact0(DEF_VECT) * gpcar(DEF_VECT,3,inode,igaus)
elapu(DEF_VECT,jnode,idof1) = elapu(DEF_VECT,jnode,idof1) + fact1(DEF_VECT)
elapu(DEF_VECT,jnode,idof2) = elapu(DEF_VECT,jnode,idof2) + fact2(DEF_VECT)
elapu(DEF_VECT,jnode,idof3) = elapu(DEF_VECT,jnode,idof3) + fact3(DEF_VECT)
elaup(DEF_VECT,idof1,jnode) = elaup(DEF_VECT,idof1,jnode) - fact1(DEF_VECT)
elaup(DEF_VECT,idof2,jnode) = elaup(DEF_VECT,idof2,jnode) - fact2(DEF_VECT)
elaup(DEF_VECT,idof3,jnode) = elaup(DEF_VECT,idof3,jnode) - fact3(DEF_VECT)
end do
end do
end do
end if
!----------------------------------------------------------------------
!
! App
!
!----------------------------------------------------------------------
!
! Pressure: ( tau1' * grad(p) , grad(q) )
!
if( kfl_stabi_nsi /= -1 ) then
do igaus = 1,pgaus
do inode = 1,pnode
do jnode = inode+1,pnode
fact1(DEF_VECT) = gpsp1_p(DEF_VECT,igaus) * wgrgr(DEF_VECT,jnode,inode,igaus) * gpvol(DEF_VECT,igaus)
elapp(DEF_VECT,jnode,inode) = elapp(DEF_VECT,jnode,inode) + fact1(DEF_VECT)
elapp(DEF_VECT,inode,jnode) = elapp(DEF_VECT,inode,jnode) + fact1(DEF_VECT)
end do
fact1(DEF_VECT) = gpsp1_p(DEF_VECT,igaus) * wgrgr(DEF_VECT,inode,inode,igaus) * gpvol(DEF_VECT,igaus)
elapp(DEF_VECT,inode,inode) = elapp(DEF_VECT,inode,inode) + fact1(DEF_VECT)
end do
end do
end if
!
! Penalization
!
do igaus = 1,pgaus
fact1(DEF_VECT) = penal_nsi * gpvol(DEF_VECT,igaus)
do inode = 1,pnode
elapp(DEF_VECT,inode,inode) = elapp(DEF_VECT,inode,inode) + fact1(DEF_VECT) * gpsha(DEF_VECT,inode,igaus)
elrbp(DEF_VECT,inode) = elrbp(DEF_VECT,inode) + fact1(DEF_VECT) * gpsha(DEF_VECT,inode,igaus) * elpre(DEF_VECT,inode,1)
end do
end do
!----------------------------------------------------------------------
!
! bu and bp
!
! P1 = P [ tau1' * rho * a.grad(u) ]
! P1' = P1 + tau1' * rho * u'n / dt
!
! P2 = P [ tau1' * ( grad(p) - f ) ]
! P2' = P2 + tau1' * rho * u'n / dt + tau1' * f
!
!----------------------------------------------------------------------
!
! Limiter
!
if( kfl_stabi_nsi == -1 ) then
gpvep(DEF_VECT,:,:) = 0.0_rp
else if( kfl_limit_nsi == -1 ) then
gpvep(DEF_VECT,:,:) = 0.0_rp
else if( kfl_limit_nsi > 0 ) then
do igaus = 1,pgaus
c1(DEF_VECT) = 0.0_rp
c2(DEF_VECT) = 0.0_rp
c3(DEF_VECT) = 0.0_rp
do idime = 1,ndime
c4(DEF_VECT) = 0.0_rp
do inode = 1,pnode
c4(DEF_VECT) = c4(DEF_VECT) + agrau(DEF_VECT,inode,igaus) * elvel(DEF_VECT,idime,inode,1)
end do
c4(DEF_VECT) = gpsp1(DEF_VECT,igaus) * c4(DEF_VECT)
c1(DEF_VECT) = c1(DEF_VECT) + ( gpvep(DEF_VECT,idime,igaus) - c4(DEF_VECT) )**2
c3(DEF_VECT) = c3(DEF_VECT) + gpvep(DEF_VECT,idime,igaus) * gpvep(DEF_VECT,idime,igaus)
c2(DEF_VECT) = c2(DEF_VECT) + c4(DEF_VECT) * c4(DEF_VECT)
end do
c3(DEF_VECT) = sqrt( c2(DEF_VECT) ) + sqrt( c3(DEF_VECT) )
c1(DEF_VECT) = sqrt( c1(DEF_VECT) )
beta(DEF_VECT) = c1(DEF_VECT) / ( c3(DEF_VECT) + epsilon(1.0_rp) )
if( kfl_limit_nsi == 1 ) then
alpha(DEF_VECT) = min(1.0_rp,2.0_rp*(1.0_rp-beta(DEF_VECT)))
else if( kfl_limit_nsi == 2 ) then
alpha(DEF_VECT) = 0.5_rp*(tanh(20.0_rp*(beta(DEF_VECT)-0.8_rp))+1.0_rp)
end if
do idime = 1,ndime
gpvep(DEF_VECT,idime,igaus) = alpha(DEF_VECT) * gpvep(DEF_VECT,idime,igaus)
end do
end do
end if
!
! P2 <= P2 + tau1' * f
!
if( kfl_stabi_nsi == -1 ) then
gpgrp(DEF_VECT,:,:) = 0.0_rp
else
do igaus = 1,pgaus
do idime = 1,ndime
gpgrp(DEF_VECT,idime,igaus) = gpgrp(DEF_VECT,idime,igaus) + gpsp1_p(DEF_VECT,igaus) * gprhs(DEF_VECT,idime,igaus)
end do
end do
!
! P1 <= P1 + tau1' * rho * u'n / dt
! P2 <= P2 + tau1' * rho * u'n / dt
!
if( kfl_sgsti_nsi == 1 ) then
do igaus = 1,pgaus
fact1(DEF_VECT) = gpden(DEF_VECT,igaus) * dtsgs(DEF_VECT) * gpsp1_v(DEF_VECT,igaus)
fact1_p (DEF_VECT) = gpden(DEF_VECT,igaus) * dtsgs(DEF_VECT) * gpsp1_p(DEF_VECT,igaus)
do idime = 1,ndime
gpvep(DEF_VECT,idime,igaus) = gpvep(DEF_VECT,idime,igaus) + fact1(DEF_VECT) * gpsgs(DEF_VECT,idime,igaus,2)
gpgrp(DEF_VECT,idime,igaus) = gpgrp(DEF_VECT,idime,igaus) + fact1_p(DEF_VECT) * gpsgs(DEF_VECT,idime,igaus,2)
end do
end do
end if
end if
!
! bu = ( f + rho*u^n/dt , v ) + ( rho * a.grad(v) , tau1' * rho u'^n/dt + P1 )
! = ( f + rho*u^n/dt , v ) + ( rho * a.grad(v) , P1' )
!
! bp = ( f + rho*u'^n/dt , tau1' grad(q) ) + ( P2 , grad(q) )
! = ( P2' , grad(q) )
!
do igaus = 1,pgaus
fact4(DEF_VECT) = gpden(DEF_VECT,igaus) * dtinv_mod(DEF_VECT)
do itime = 2,nbdfp_nsi
do idime = 1,ndime
gprhs(DEF_VECT,idime,igaus) = gprhs(DEF_VECT,idime,igaus) - pabdf_nsi(itime) * fact4(DEF_VECT) * gpvel(DEF_VECT,idime,igaus,itime)
end do
end do
do inode = 1,pnode
fact1(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus) ! ( f + rho*u^n/dt , v )
fact3(DEF_VECT) = gpvol(DEF_VECT,igaus) * agrau(DEF_VECT,inode,igaus) ! ( rho * a.grad(v) , P1' )
do idime = 1,ndime
elrbu(DEF_VECT,idime,inode) = elrbu(DEF_VECT,idime,inode) + fact1(DEF_VECT) * gprhs(DEF_VECT,idime,igaus) &
& + fact3(DEF_VECT) * gpvep(DEF_VECT,idime,igaus)
end do
elrbp(DEF_VECT,inode) = elrbp(DEF_VECT,inode) + gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus) * gprhc(DEF_VECT,igaus) ! ( rhs, q )
do idime = 1,ndime
elrbp(DEF_VECT,inode) = elrbp(DEF_VECT,inode) + gpvol(DEF_VECT,igaus) * gpcar(DEF_VECT,idime,inode,igaus) * gpgrp(DEF_VECT,idime,igaus) ! ( P2' , grad(q) )
end do
end do
end do
!--------------------------------------------------------------------
!
! Pressure bubble
!
!--------------------------------------------------------------------
if( maxval(pbubl) == 1 ) then
if( kfl_stabi_nsi /= -1 ) then
write(6,*) 'BUBBLE NOT CODED FOR SPLIT OSS'
stop
end if
!
! Initialization
!
elauq = 0.0_rp
elapq = 0.0_rp
elaqu = 0.0_rp
elaqp = 0.0_rp
elaqq = 0.0_rp
elrbq = 0.0_rp
!
! Auq and Aqu
!
if( kfl_press_nsi == 1 ) then
do igaus = 1,pgaus
fact1(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha_bub(DEF_VECT,igaus)
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
elauq(DEF_VECT,idofv,1) = elauq(DEF_VECT,idofv,1) - fact1(DEF_VECT) * gpcar(DEF_VECT,idime,inode,igaus)
elaqu(DEF_VECT,1,idofv) = elaqu(DEF_VECT,1,idofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,idime,inode,igaus)
end do
end do
end do
else
do igaus = 1,pgaus
fact1(DEF_VECT) = gpvol(DEF_VECT,igaus) * gpsha_bub(DEF_VECT,igaus)
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
elauq(DEF_VECT,idofv,1) = elauq(DEF_VECT,idofv,1) + gpvol(DEF_VECT,igaus) * gpsha(DEF_VECT,inode,igaus) * gpcar_bub(DEF_VECT,idime,igaus)
elaqu(DEF_VECT,1,idofv) = elaqu(DEF_VECT,1,idofv) + fact1(DEF_VECT) * gpcar(DEF_VECT,idime,inode,igaus)
end do
end do
end do
end if
!
! Penalization and others
!
do igaus = 1,pgaus
elaqq(DEF_VECT,1,1) = elaqq(DEF_VECT,1,1) + gpvol(DEF_VECT,igaus) * gpsha_bub(DEF_VECT,igaus) * penal_nsi
elrbq(DEF_VECT,1) = elrbq(DEF_VECT,1) + gpvol(DEF_VECT,igaus) * gpsha_bub(DEF_VECT,igaus) * penal_nsi * elbub(DEF_VECT)
elrbq(DEF_VECT,1) = elrbq(DEF_VECT,1) + gpvol(DEF_VECT,igaus) * gpsha_bub(DEF_VECT,igaus) * gprhc(DEF_VECT,igaus)
end do
end if
end subroutine nsi_element_assembly_split_oss
Here our objective is twofold, we do not only want to generate a
kernel with BOAST that can be easily optimized and specialized but
we also want generate to the exact same kernel than the reference
version and to run it using BOAST, in order to compare the results
in term of correctness and performances. For the reference
implementation we simply need to keep the FORTRAN code and wrap it
with BOAST. For the BOAST implementation we use the BOAST dedicated
language and the meta-programming style.
The kernel nsi_element_assembly_split_oss
contains 11 loop nests
that we isolated to eases the porting. This gave us the possibility
to test and bench them separately and extract them into different
subroutine that can be either pasted in the main kernel procedure,
either inlined or simply called as normal function.
For the both implementations we need to tell to BOAST what are the parameters of the kernel.
As the different implementations (reference and BOAST) of the
kernel and the different loop nests encapsulated into subroutines
share common variable definition, the definition of all the
variables used by nsi_element_assembly_split_oss
(including local
variables) are centralized in the following module. When a
variable is needed by the main routine (the kernel), it can be
used as it is. However when needed by a subroutine, it must be
asked by the copy
function. Indeed a subroutine may need a
variable but with a different direction than originally
defined. For instance the agrau
variable is defined as :out
by the
kernel but a subroutine may need to read only this variable and
only the :in
direction is necessary. Thus copy
returns either
directly the variable if there is no need to change the original
direction or a copy in a different direction than the original one
direction is asked. The reason why a copy is needed when changing
the original direction of a parameter is that otherwise it will
impact the declaration of all the procedures using this parameter.
require 'BOAST'
include BOAST
module Parameters
$ndime = Int("ndime", :dir => :in )
$kfl_lumped = Int("kfl_lumped", :dir => :in)
$mnode = Int("mnode", :dir => :in)
$ntens = Int("ntens", :dir => :in)
$kfl_duatss = Int("kfl_duatss", :dir => :in)
$fact_duatss = Int("fact_duatss", :dir => :in)
$kfl_stabi_nsi = Int("kfl_stabi_nsi", :dir => :in)
$fvins_nsi = Real("fvins_nsi", :dir => :in)
$fcons_nsi = Real("fcons_nsi", :dir => :in)
$bemol_nsi = Real("bemol_nsi", :dir => :in)
$kfl_regim_nsi = Int("kfl_regim_nsi", :dir => :in)
$fvela_nsi = Real("fvela_nsi", :dir => :in, :dim => [Dim(3)])
$kfl_rmom2_nsi = Int("kfl_rmom2_nsi", :dir => :in)
$kfl_press_nsi = Int("kfl_press_nsi", :dir => :in)
$kfl_p1ve2_nsi = Int("kfl_p1ve2_nsi", :dir => :in)
$kfl_linea_nsi = Int("kfl_linea_nsi", :dir => :in)
$pabdf_nsi = Real("pabdf_nsi", :dir => :in, :dim => [Dim()])
$kfl_confi_nsi = Int("kfl_confi_nsi", :dir => :in)
$nbdfp_nsi = Int("nbdfp_nsi", :dir => :in)
$kfl_sgsti_nsi = Int("kfl_sgsti_nsi", :dir => :in)
$kfl_nota1_nsi = Int("kfl_nota1_nsi", :dir => :in)
$kfl_limit_nsi = Int("kfl_limit_nsi", :dir => :in)
$kfl_penal_nsi = Int("kfl_penal_nsi", :dir => :in)
$penal_nsi = Real("penal_nsi", :dir => :in)
$kfl_bubbl_nsi = Int("kfl_bubbl_nsi", :dir => :in)
$pnode = Int("pnode", :dir => :in)
$pgaus = Int("pgaus", :dir => :in)
$inode = Int("inode")
$jnode = Int("jnode")
$jdime = Int("jdime")
$idofv = Int("idofv")
$ivect = Int("ivect")
$igaus = Int("igaus")
$idime = Int("idime")
$jdofv = Int("jdofv")
$itime = Int("itime")
def self.initialize(vector_length)
allocate = get_lang == C ? true : false
$gpden = Real("gpden", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpvis = Real("gpvis", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gppor = Real("gppor", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpgvi = Real("gpgvi", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
$gptt1 = Real("gptt1", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gptt2 = Real("gptt2", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gplap = Real("gplap", :dir => :in, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)])
$gphes = Real("gphes", :dir => :in, :vector_length => vector_length, :dim => [Dim($ntens),Dim($mnode),Dim($pgaus)])
$gprhs_sgs = Real("gprhs_sgs", :dir => :inout, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
$gprh2 = Real("gprh2", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$rmomu = Real("rmomu", :dir => :in, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)])
$rcont = Real("rcont", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pnode),Dim($pgaus)])
$gpsp1 = Real("gpsp1", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpsp2 = Real("gpsp2", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpvol = Real("gpvol", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpsha = Real("gpsha", :dir => :in, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)])
$gpcar = Real("gpcar", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($mnode),Dim($pgaus)])
$gpadv = Real("gpadv", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
$gpvep = Real("gpvep", :dir => :inout , :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
$gpgrp = Real("gpgrp", :dir => :inout , :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
$gprhs = Real("gprhs", :dir => :inout , :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
$gprhc = Real("gprhc", :dir => :inout , :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpvel = Real("gpvel", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus),Dim()])
$gpsgs = Real("gpsgs", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus),Dim()])
$elvel = Real("elvel", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pnode),Dim()])
$elpre = Real("elpre", :dir => :in, :vector_length => vector_length, :dim => [Dim($pnode),Dim()])
$elbub = Real("elbub", :dir => :in, :vector_length => vector_length, :dim => [Dim(1)])
$wgrgr = Real("wgrgr", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pnode),Dim($pgaus)])
$agrau = Real("agrau", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)])
# Matrices
$elauu = Real("elauu", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode*$ndime),Dim($pnode*$ndime)])
$elaup = Real("elaup", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode*$ndime),Dim($pnode)])
$elapp = Real("elapp", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pnode)])
$elapu = Real("elapu", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode),Dim($pnode*$ndime)])
$elrbu = Real("elrbu", :dir => :out, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pnode)])
$elrbp = Real("elrbp", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode)])
$rmom2 = Real("rmom2", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($ndime),Dim($pnode),Dim($pgaus)])
$gpst1 = Real("gpst1", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpgve = Real("gpgve", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($ndime),Dim($pgaus)])
$ellum = Real("ellum", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode)])
$gppre = Real("gppre", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
# Others
$dtinv_loc = Real("dtinv_loc", :dir => :in, :vector_length => vector_length, :dim => [Dim(1)])
$dtsgs = Real("dtsgs", :dir => :in, :vector_length => vector_length, :dim => [Dim(1)])
$pbubl = Int("pbubl", :dir => :in, :vector_length => vector_length, :dim => [Dim(1)])
$gpsha_bub = Real("gpsha_bub", :dir => :in, :vector_length => vector_length, :dim => [Dim($pgaus)])
$gpcar_bub = Real("gpcar_bub", :dir => :in, :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)])
# Enrichement Element matrices
$elauq = Real("elauq", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode*$ndime),Dim(1)])
$elapq = Real("elapq", :dir => :out, :vector_length => vector_length, :dim => [Dim($pnode),Dim(1)])
$elaqu = Real("elaqu", :dir => :out, :vector_length => vector_length, :dim => [Dim(1),Dim($pnode*$ndime)])
$elaqp = Real("elaqp", :dir => :out, :vector_length => vector_length, :dim => [Dim(1),Dim($pnode)])
$elaqq = Real("elaqq", :dir => :out, :vector_length => vector_length, :dim => [Dim(1),Dim(1)])
$elrbq = Real("elrbq", :dir => :out, :vector_length => vector_length, :dim => [Dim(1)])
# Locals
$ivect = Int("ivect")
$kdime = Int("kdime")
$xvis2 = Real("xvis2")
$xvisc = Real("xvisc")
$one_rp = Real("one_rp")
$p1ve2 = Real("p1ve2", :vector_length => vector_length, :dim => [Dim($ndime),Dim($ndime),Dim($pnode),Dim($pgaus)], :allocate => allocate)
$p1vec = Real("p1vec", :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)], :allocate => allocate)
$p2vec = Real("p2vec", :vector_length => vector_length, :dim => [Dim($ndime),Dim($pnode),Dim($pgaus)], :allocate => allocate)
$p2sca = Real("p2sca", :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)], :allocate => allocate)
$wgrvi = Real("wgrvi", :vector_length => vector_length, :dim => [Dim($pnode),Dim($pgaus)], :allocate => allocate)
$factx = Real("factx", :vector_length => vector_length)
$facty = Real("facty", :vector_length => vector_length)
$facx1 = Real("facx1", :vector_length => vector_length)
$facy1 = Real("facy1", :vector_length => vector_length)
$ugraN = Real("ugraN", :vector_length => vector_length)
$gramugraN = Real("gramugraN", :vector_length => vector_length)
$penal = Real("penal", :vector_length => vector_length)
$gprhh = Real("gprhh", :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)], :allocate => allocate)
$taupr = Real("taupr", :vector_length => vector_length, :dim => [Dim($pgaus)], :allocate => allocate)
$gpveo = Real("gpveo", :vector_length => vector_length, :dim => [Dim($ndime)], :allocate => allocate)
$gpcar1ji = Real("gpcar1ji", :vector_length => vector_length)
$gpcar2ji = Real("gpcar2ji", :vector_length => vector_length)
$gpcar3ji = Real("gpcar3ji", :vector_length => vector_length)
$p2sca_bub = Real("p2sca_bub", :vector_length => vector_length, :dim => [Dim($pgaus)], :allocate => allocate)
$p2vec_bub = Real("p2vec_bub", :vector_length => vector_length, :dim => [Dim($ndime),Dim($pgaus)], :allocate => allocate)
for i in 1..3 do
for j in 1..3 do
eval '$elauu#{i}#{j} = Real("elauu#{i}#{j}", :vector_length => vector_length, :dim => [Dim(4*((pnode+3)/4)), Dim($pnode)], :allocate => allocate)'
end
end
for i in 1..4 do
eval '$factvec#{i} = Real("factvec#{i}", :vector_length => vector_length, :dim=> [Dim(4*((pnode+3)/4))], :allocate => allocate)'
end
$gpsp1_p = Real("gpsp1_p", :vector_length => vector_length, :dim => [Dim($pgaus)], :allocate => allocate)
$gpsp1_v = Real("gpsp1_v", :vector_length => vector_length, :dim => [Dim($pgaus)], :allocate => allocate)
$gpsp2_v = Real("gpsp2_v", :vector_length => vector_length, :dim => [Dim($pgaus)], :allocate => allocate)
$c1 = Real("c1", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$c2 = Real("c2", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$c3 = Real("c3", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$c4 = Real("c4", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$alpha = Real("alpha", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$beta = Real("beta", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$gpveo = Real("gpveo", :vector_length => vector_length, :dim => [Dim(3)], :allocate => allocate)
$fact1_p = Real("fact1_p", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$dtinv_mod = Real("dtinv_mod", :vector_length => vector_length, :dim => [Dim(1)], :allocate => allocate)
$fact = Real('fact', :vector_length => vector_length, :dim => [Dim(9)], :allocate => allocate)
$idof = Int("idof", :dim => [Dim(3)], :allocate => allocate)
$jdof = Int("jdof", :dim => [Dim(3)], :allocate => allocate)
end
def self.copy(arg, direction = nil)
if direction == arg.direction or direction.nil? then
return arg
else
(copy = arg.clone).direction = direction
return copy
end
end
end
For each implementation we need to declare the prototype of the
function nsi_element_assembly_split_oss
with BOAST.
First we declare the parameters of the kernel with BOAST using
the module Argument.
Parameters.initialize(@opts[:vector_length])
@args = []
@args.push @ndime = $ndime
@args.push @kfl_lumped = $kfl_lumped
@args.push @mnode = $mnode
@args.push @ntens = $ntens
@args.push @kfl_duatss = $kfl_duatss
@args.push @fact_duatss = $fact_duatss
@args.push @kfl_stabi_nsi = $kfl_stabi_nsi
@args.push @fvins_nsi = $fvins_nsi
@args.push @fcons_nsi = $fcons_nsi
@args.push @bemol_nsi = $bemol_nsi
@args.push @kfl_regim_nsi = $kfl_regim_nsi
@args.push @fvela_nsi = $fvela_nsi
@args.push @kfl_rmom2_nsi = $kfl_rmom2_nsi
@args.push @kfl_press_nsi = $kfl_press_nsi
@args.push @kfl_p1ve2_nsi = $kfl_p1ve2_nsi
@args.push @kfl_linea_nsi = $kfl_linea_nsi
@args.push @kfl_confi_nsi = $kfl_confi_nsi
@args.push @nbdfp_nsi = $nbdfp_nsi
@args.push @kfl_sgsti_nsi = $kfl_sgsti_nsi
@args.push @kfl_nota1_nsi = $kfl_nota1_nsi
@args.push @kfl_limit_nsi = $kfl_limit_nsi
@args.push @kfl_penal_nsi = $kfl_penal_nsi
@args.push @penal_nsi = $penal_nsi
@args.push @kfl_bubbl_nsi = $kfl_bubbl_nsi
@args.push @pnode = $pnode
@args.push @pgaus = $pgaus
@args.push @gpden = $gpden
@args.push @gpvis = $gpvis
@args.push @gppor = $gppor
@args.push @gpsp1 = $gpsp1
@args.push @gpsp2 = $gpsp2
@args.push @gpvol = $gpvol
@args.push @gpsha = $gpsha
@args.push @gpcar = $gpcar
@args.push @gpadv = $gpadv
@args.push @gpvep = $gpvep
@args.push @gpgrp = $gpgrp
@args.push @gprhs = $gprhs
@args.push @gprhc = $gprhc
@args.push @gpvel = $gpvel
@args.push @gpsgs = $gpsgs
@args.push @elvel = $elvel
@args.push @elpre = $elpre
@args.push @elbub = $elbub
@args.push @wgrgr = $wgrgr
@args.push @agrau = $agrau
# Matrices
@args.push @elauu = $elauu
@args.push @elaup = $elaup
@args.push @elapp = $elapp
@args.push @elapu = $elapu
@args.push @elrbu = $elrbu
@args.push @elrbp = $elrbp
# Others
@args.push @dtinv_loc = $dtinv_loc
@args.push @dtsgs = $dtsgs
@args.push @pbubl = $pbubl
@args.push @gpsha_bub = $gpsha_bub
@args.push @gpcar_bub = $gpcar_bub
# Enrichement Element matrices
@args.push @elauq = $elauq
@args.push @elapq = $elapq
@args.push @elaqu = $elaqu
@args.push @elaqp = $elaqp
@args.push @elaqq = $elaqq
@args.push @elrbq = $elrbq
Then we create the Procedure
nsi_element_assembly_split_oss
using
the parameters defined previously and specifying the functions
that will be used by the kernel.
def declare_procedure(functions = nil)
return Procedure("nsi_element_assembly_split_oss", @args, :functions => functions)
end
The previous operations are common for the reference and BOAST implementation, thus, the following class is used to encapsulate the different implementation of the kernel by gathering the parameters declaration and the generation of the procedure header.
require_relative '../Common/Parameters.rb'
class KSplitOss
include Parameters
attr_reader :kernel
def initialize(options)
@opts = {:vector_length => 1, :preprocessor => false, :nests => (1..10).to_a, :unroll => false, :inline => :included}
@opts.update(options)
<<common-declare-parameters>>
end
<<common-declare-procedure>>
end
Here we simply reuse the FORTRAN code that we simply manipulate as
string. For convenience purpose we replaced the macro VECTOR_LENGTH
and DEF_VECT
by ruby variable $p_vector_length
and $p_def_vect
that
will either use DEF_VECT
and VECTOR_LENGTH
or either paste directly
the corresponding values depending to the options passed to the
kernel.
Some variables/functions are not available so we need to fake
them like it is the case with pabdf_nsi
:
def generate_mocks
mocks = <<EOF
function pabdf_nsi(x) result(y)
integer,intent(in) :: x
real :: y
y = 1.0
end function pabdf_nsi
EOF
return mocks
end
Here is the code extracted from def_parameters.f90
.
We can get rid of ndime
and vector_size
because we can easily
modify these parameters with BOAST.
def gen_def_parameters
parameters =<<EOF
integer, parameter :: ip = 4 ! 4-byte integer
integer, parameter :: rp = 8 ! Double precision
real(rp), parameter :: zeror = epsilon(1.0_rp) ! Almost zero
integer(ip), parameter :: TET04 = 30 ! 3D
integer(ip), parameter :: TET10 = 31 ! 3D
integer(ip), parameter :: PYR05 = 32 ! 3D
integer(ip), parameter :: PYR14 = 33 ! 3D
integer(ip), parameter :: PEN06 = 34 ! 3D
integer(ip), parameter :: PEN15 = 35 ! 3D
integer(ip), parameter :: PEN18 = 36 ! 3D
integer(ip), parameter :: HEX08 = 37 ! 3D
integer(ip), parameter :: HEX20 = 38 ! 3D
integer(ip), parameter :: HEX27 = 39 ! 3D
integer(ip), parameter :: HEX64 = 40 ! 3D
integer(ip), parameter :: SHELL = 51 ! 3D shell element
integer(ip), parameter :: BAR3D = 52 ! 3D bar element
EOF
return parameters
end
Here by calling gen_def_parameters
we just paste the code of
def_parameters
just between the function header and the
parameters declaration.
def generate_ref_declaration
decl_ref =<<EOF
subroutine nsi_element_assembly_split_oss(ndime,&
kfl_lumped,mnode,ntens,kfl_duatss,fact_duatss,&
kfl_stabi_nsi,fvins_nsi,fcons_nsi,bemol_nsi,&
kfl_regim_nsi,fvela_nsi,kfl_rmom2_nsi,kfl_press_nsi,&
kfl_p1ve2_nsi,kfl_linea_nsi,kfl_confi_nsi,nbdfp_nsi,&
kfl_sgsti_nsi,kfl_nota1_nsi,kfl_limit_nsi,kfl_penal_nsi,&
penal_nsi,kfl_bubbl_nsi,pnode,pgaus,gpden,gpvis,gppor,&
gpsp1,gpsp2,gpvol,gpsha,gpcar,gpadv,gpvep,gpgrp,gprhs,&
gprhc,gpvel,gpsgs,elvel,elpre,elbub,wgrgr,agrau,elauu,&
elaup,elapp,elapu,elrbu,elrbp,dtinv_loc,dtsgs,pbubl,&
gpsha_bub,gpcar_bub,elauq,elapq,elaqu,elaqp,elaqq,elrbq)
#{gen_def_parameters}
integer(ip), intent(in) :: ndime
integer(ip), intent(in) :: kfl_lumped
integer(ip), intent(in) :: mnode
integer(ip), intent(in) :: ntens
integer(ip), intent(in) :: kfl_duatss
integer(ip), intent(in) :: fact_duatss
integer(ip), intent(in) :: kfl_stabi_nsi
real(rp), intent(in) :: fvins_nsi
real(rp), intent(in) :: fcons_nsi
real(rp), intent(in) :: bemol_nsi
integer(ip), intent(in) :: kfl_regim_nsi
real(rp), intent(in) :: fvela_nsi(3)
integer(ip), intent(in) :: kfl_rmom2_nsi
integer(ip), intent(in) :: kfl_press_nsi
integer(ip), intent(in) :: kfl_p1ve2_nsi
integer(ip), intent(in) :: kfl_linea_nsi
integer(ip), intent(in) :: kfl_confi_nsi
integer(ip), intent(in) :: nbdfp_nsi
integer(ip), intent(in) :: kfl_sgsti_nsi
integer(ip), intent(in) :: kfl_nota1_nsi
integer(ip), intent(in) :: kfl_limit_nsi
integer(ip), intent(in) :: kfl_penal_nsi
real(rp), intent(in) :: penal_nsi
integer(ip), intent(in) :: kfl_bubbl_nsi
integer(ip), intent(in) :: pnode,pgaus
real(rp), intent(in) :: gpden(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpvis(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gppor(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpsp1(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpsp2(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpvol(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpsha(#{$p_vector_length},pnode,pgaus)
real(rp), intent(in) :: gpcar(#{$p_vector_length},ndime,mnode,pgaus)
real(rp), intent(in) :: gpadv(#{$p_vector_length},ndime,pgaus)
real(rp), intent(inout) :: gpvep(#{$p_vector_length},ndime,pgaus)
real(rp), intent(inout) :: gpgrp(#{$p_vector_length},ndime,pgaus)
real(rp), intent(inout) :: gprhs(#{$p_vector_length},ndime,pgaus)
real(rp), intent(inout) :: gprhc(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpvel(#{$p_vector_length},ndime,pgaus,*)
real(rp), intent(in) :: gpsgs(#{$p_vector_length},ndime,pgaus,*)
real(rp), intent(in) :: elvel(#{$p_vector_length},ndime,pnode,*)
real(rp), intent(in) :: elpre(#{$p_vector_length},pnode,*)
real(rp), intent(in) :: elbub(#{$p_vector_length})
real(rp), intent(out) :: wgrgr(#{$p_vector_length},pnode,pnode,pgaus)
real(rp), intent(out) :: agrau(#{$p_vector_length},pnode,pgaus)
! Matrices
real(rp), intent(out) :: elauu(#{$p_vector_length},pnode*ndime,pnode*ndime)
real(rp), intent(out) :: elaup(#{$p_vector_length},pnode*ndime,pnode)
real(rp), intent(out) :: elapp(#{$p_vector_length},pnode,pnode)
real(rp), intent(out) :: elapu(#{$p_vector_length},pnode,pnode*ndime)
real(rp), intent(out) :: elrbu(#{$p_vector_length},ndime,pnode)
real(rp), intent(out) :: elrbp(#{$p_vector_length},pnode)
! Others
real(rp), intent(in) :: dtinv_loc(#{$p_vector_length})
real(rp), intent(in) :: dtsgs(#{$p_vector_length})
integer(ip), intent(in) :: pbubl(#{$p_vector_length})
real(rp), intent(in) :: gpsha_bub(#{$p_vector_length},pgaus)
real(rp), intent(in) :: gpcar_bub(#{$p_vector_length},ndime,pgaus)
! Enrichement Element matrices
real(rp), intent(out) :: elauq(#{$p_vector_length},pnode*ndime,1)
real(rp), intent(out) :: elapq(#{$p_vector_length},pnode,1)
real(rp), intent(out) :: elaqu(#{$p_vector_length},1,pnode*ndime)
real(rp), intent(out) :: elaqp(#{$p_vector_length},1,pnode)
real(rp), intent(out) :: elaqq(#{$p_vector_length},1,1)
real(rp), intent(out) :: elrbq(#{$p_vector_length},1)
! Local arrays
real(rp) :: gpsp1_p(#{$p_vector_length},pgaus)
real(rp) :: gpsp1_v(#{$p_vector_length},pgaus)
real(rp) :: gpsp2_v(#{$p_vector_length},pgaus)
real(rp) :: c1(#{$p_vector_length})
real(rp) :: c2(#{$p_vector_length})
real(rp) :: c3(#{$p_vector_length})
real(rp) :: c4(#{$p_vector_length})
real(rp) :: alpha(#{$p_vector_length})
real(rp) :: beta(#{$p_vector_length})
real(rp) :: fact0(#{$p_vector_length})
real(rp) :: fact1(#{$p_vector_length})
real(rp) :: fact2(#{$p_vector_length})
real(rp) :: fact3(#{$p_vector_length})
real(rp) :: fact4(#{$p_vector_length})
real(rp) :: fact5(#{$p_vector_length})
real(rp) :: fact6(#{$p_vector_length})
real(rp) :: fact7(#{$p_vector_length})
real(rp) :: fact8(#{$p_vector_length})
real(rp) :: gpveo(#{$p_vector_length},3)
real(rp) :: fact1_p(#{$p_vector_length})
real(rp) :: dtinv_mod(#{$p_vector_length})
integer(ip) :: inode,jnode,jdime
integer(ip) :: idofv,jdof2,jdof3,ivect
integer(ip) :: idof1,idof3,idof2,igaus
integer(ip) :: idime,jdof1,jdofv,itime
EOF
if @opts[:preprocessor] then
decl_ref = decl_ref + <<EOF
#ifdef OPENACC
#define DEF_VECT ivect
#else
#define DEF_VECT 1:#{$p_vector_length}
#endif
EOF
end
return decl_ref
end
def generate_ref_initialization
init = <<EOF
dtinv_mod = dtinv_loc
gpsp1_p = gpsp1
gpsp1_v = gpsp1
gpsp2_v = gpsp2
if( kfl_nota1_nsi == 1 ) gpsp1_v = 0.0_rp
elrbp = 0.0_rp
elrbu = 0.0_rp
elapp = 0.0_rp
elauu = 0.0_rp
elaup = 0.0_rp
elapu = 0.0_rp
EOF
return init
end
first_nest = <<EOF
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
agrau(#{$p_def_vect},inode,igaus) = gpden(#{$p_def_vect},igaus) * ( &
& gpadv(#{$p_def_vect},1,igaus)*gpcar(#{$p_def_vect},1,inode,igaus) &
& + gpadv(#{$p_def_vect},2,igaus)*gpcar(#{$p_def_vect},2,inode,igaus) )
do jnode = 1,pnode
wgrgr(#{$p_def_vect},inode,jnode,igaus) = &
& gpcar(#{$p_def_vect},1,inode,igaus)*gpcar(#{$p_def_vect},1,jnode,igaus) &
& + gpcar(#{$p_def_vect},2,inode,igaus)*gpcar(#{$p_def_vect},2,jnode,igaus)
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
agrau(#{$p_def_vect},inode,igaus) = gpden(#{$p_def_vect},igaus) * ( &
& gpadv(#{$p_def_vect},1,igaus)*gpcar(#{$p_def_vect},1,inode,igaus) &
& + gpadv(#{$p_def_vect},2,igaus)*gpcar(#{$p_def_vect},2,inode,igaus) &
& + gpadv(#{$p_def_vect},3,igaus)*gpcar(#{$p_def_vect},3,inode,igaus) )
do jnode = 1,pnode
wgrgr(#{$p_def_vect},inode,jnode,igaus) = &
& gpcar(#{$p_def_vect},1,inode,igaus)*gpcar(#{$p_def_vect},1,jnode,igaus) &
& + gpcar(#{$p_def_vect},2,inode,igaus)*gpcar(#{$p_def_vect},2,jnode,igaus) &
& + gpcar(#{$p_def_vect},3,inode,igaus)*gpcar(#{$p_def_vect},3,jnode,igaus)
end do
end do
end do
end if
EOF
second_nest = <<EOF
if( ndime == 2 ) then
do igaus = 1,pgaus
fact0(#{$p_def_vect}) = gpsp2(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact6(#{$p_def_vect}) = gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact7(#{$p_def_vect}) = gpsp1_v(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact8(#{$p_def_vect}) = pabdf_nsi(1) * gpden(#{$p_def_vect},igaus) * dtinv_loc(#{$p_def_vect}) + gppor(#{$p_def_vect},igaus)
do inode = 1,pnode
idof1 = 2*inode-1
idof2 = 2*inode
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
fact2(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
fact4(#{$p_def_vect}) = gpsha(#{$p_def_vect},inode,igaus) * gpvol(#{$p_def_vect},igaus)
do jnode = 1,pnode
jdof1 = 2*jnode-1
jdof2 = 2*jnode
fact5(#{$p_def_vect}) = fact4(#{$p_def_vect}) * ( agrau(#{$p_def_vect},jnode,igaus) + fact8(#{$p_def_vect}) * gpsha(#{$p_def_vect},jnode,igaus) ) &
& + fact6(#{$p_def_vect}) * wgrgr(#{$p_def_vect},inode,jnode,igaus) &
& + fact7(#{$p_def_vect}) * agrau(#{$p_def_vect},jnode,igaus) * agrau(#{$p_def_vect},inode,igaus)
elauu(#{$p_def_vect},idof1,jdof1) = elauu(#{$p_def_vect},idof1,jdof1) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus) + fact5(#{$p_def_vect})
elauu(#{$p_def_vect},idof2,jdof1) = elauu(#{$p_def_vect},idof2,jdof1) + fact2(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus)
elauu(#{$p_def_vect},idof1,jdof2) = elauu(#{$p_def_vect},idof1,jdof2) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus)
elauu(#{$p_def_vect},idof2,jdof2) = elauu(#{$p_def_vect},idof2,jdof2) + fact2(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus) + fact5(#{$p_def_vect})
end do
end do
end do
else
do igaus = 1,pgaus
fact0(#{$p_def_vect}) = gpsp2(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact6(#{$p_def_vect}) = gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact7(#{$p_def_vect}) = gpsp1_v(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact8(#{$p_def_vect}) = pabdf_nsi(1) * gpden(#{$p_def_vect},igaus) * dtinv_loc(#{$p_def_vect}) + gppor(#{$p_def_vect},igaus)
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
fact2(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
fact3(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,inode,igaus)
fact4(#{$p_def_vect}) = gpsha(#{$p_def_vect},inode,igaus) * gpvol(#{$p_def_vect},igaus)
do jnode = 1,pnode
jdof1 = 3*jnode-2
jdof2 = 3*jnode-1
jdof3 = 3*jnode
fact5(#{$p_def_vect}) = fact4(#{$p_def_vect}) * ( agrau(#{$p_def_vect},jnode,igaus) + fact8(#{$p_def_vect}) * gpsha(#{$p_def_vect},jnode,igaus) ) &
+ fact6(#{$p_def_vect}) * wgrgr(#{$p_def_vect},inode,jnode,igaus) &
+ fact7(#{$p_def_vect}) * agrau(#{$p_def_vect},jnode,igaus) * agrau(#{$p_def_vect},inode,igaus)
elauu(#{$p_def_vect},idof1,jdof1) = elauu(#{$p_def_vect},idof1,jdof1) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus) + fact5(#{$p_def_vect})
elauu(#{$p_def_vect},idof2,jdof1) = elauu(#{$p_def_vect},idof2,jdof1) + fact2(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus)
elauu(#{$p_def_vect},idof3,jdof1) = elauu(#{$p_def_vect},idof3,jdof1) + fact3(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus)
elauu(#{$p_def_vect},idof1,jdof2) = elauu(#{$p_def_vect},idof1,jdof2) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus)
elauu(#{$p_def_vect},idof2,jdof2) = elauu(#{$p_def_vect},idof2,jdof2) + fact2(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus) + fact5(#{$p_def_vect})
elauu(#{$p_def_vect},idof3,jdof2) = elauu(#{$p_def_vect},idof3,jdof2) + fact3(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus)
elauu(#{$p_def_vect},idof1,jdof3) = elauu(#{$p_def_vect},idof1,jdof3) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,jnode,igaus)
elauu(#{$p_def_vect},idof2,jdof3) = elauu(#{$p_def_vect},idof2,jdof3) + fact2(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,jnode,igaus)
elauu(#{$p_def_vect},idof3,jdof3) = elauu(#{$p_def_vect},idof3,jdof3) + fact3(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,jnode,igaus) + fact5(#{$p_def_vect})
end do
end do
end do
end if
EOF
third_nest = <<EOF
if( fvins_nsi > 0.9_rp ) then
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime+idime
do jnode = 1,pnode
fact1 = gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,jnode,igaus)
jdofv = (jnode-1)*ndime + 1
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
end do
if( fvins_nsi == 2.0_rp ) then
fact1 = -2.0_rp/3.0_rp * gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,inode,igaus)
do jnode = 1,pnode
jdofv = (jnode-1)*ndime + 1
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus)
end do
end if
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
do jnode = 1,pnode
fact1 = gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,jnode,igaus)
jdofv = (jnode-1)*ndime + 1
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
jdofv = (jnode-1)*ndime + 3
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,inode,igaus)
end do
if( fvins_nsi == 2.0 ) then
fact1 = -2.0_rp / 3.0_rp * gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,inode,igaus)
do jnode = 1,pnode
jdofv = (jnode-1)*ndime + 1
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,jnode,igaus)
jdofv = (jnode-1)*ndime + 2
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,jnode,igaus)
jdofv = (jnode-1)*ndime + 3
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,jnode,igaus)
end do
end if
end do
end do
end do
end if
end if
EOF
fourth_nest = <<EOF
if( kfl_lumped == 1 ) then
if( ndime == 2 ) then
call runend('PREGUNTAR A MATIAS QUE LO PROGRAME')
else
do igaus = 1,pgaus
gpveo(#{$p_def_vect},1:3) = 0.0_rp
do inode = 1,pnode
do idime = 1,ndime
gpveo(#{$p_def_vect},idime) = gpveo(#{$p_def_vect},idime) + elvel(#{$p_def_vect},idime,inode,2) * gpsha(#{$p_def_vect},inode,igaus)
end do
end do
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpden(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus) * dtinv_loc(#{$p_def_vect})
elauu(#{$p_def_vect},idof1,idof1) = elauu(#{$p_def_vect},idof1,idof1) + fact0(#{$p_def_vect})
elauu(#{$p_def_vect},idof2,idof2) = elauu(#{$p_def_vect},idof2,idof2) + fact0(#{$p_def_vect})
elauu(#{$p_def_vect},idof3,idof3) = elauu(#{$p_def_vect},idof3,idof3) + fact0(#{$p_def_vect})
do idime = 1,ndime
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) - fact0(#{$p_def_vect}) * gpveo(#{$p_def_vect},idime)
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) + fact0(#{$p_def_vect}) * elvel(#{$p_def_vect},idime,inode,2)
end do
do jnode = 1,pnode
jdof1 = 3*jnode-2
jdof2 = 3*jnode-1
jdof3 = 3*jnode
elauu(#{$p_def_vect},idof1,jdof1) = elauu(#{$p_def_vect},idof1,jdof1) - fact0*gpsha(#{$p_def_vect},jnode,igaus)
elauu(#{$p_def_vect},idof2,jdof2) = elauu(#{$p_def_vect},idof2,jdof2) - fact0*gpsha(#{$p_def_vect},jnode,igaus)
elauu(#{$p_def_vect},idof3,jdof3) = elauu(#{$p_def_vect},idof3,jdof3) - fact0*gpsha(#{$p_def_vect},jnode,igaus)
end do
end do
end do
end if
else if( kfl_lumped == 2 ) then
do igaus = 1,pgaus
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpden(#{$p_def_vect},igaus) * dtinv_loc(#{$p_def_vect})
do inode = 1, pnode
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpsha(#{$p_def_vect},inode,igaus)
do idime = 1,ndime
idof1 = (inode-1) * ndime + idime
elauu(#{$p_def_vect},idof1,idof1) = elauu(#{$p_def_vect},idof1,idof1) + fact1(#{$p_def_vect})
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) + fact1(#{$p_def_vect}) * elvel(#{$p_def_vect},idime,inode,2)
end do
end do
end do
end if
EOF
fifth_nest = <<EOF
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 2*inode-1
idof2 = 2*inode
do jnode = 1,pnode
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},jnode,igaus)
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
fact2(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
elapu(#{$p_def_vect},jnode,idof1) = elapu(#{$p_def_vect},jnode,idof1) + fact1(#{$p_def_vect})
elapu(#{$p_def_vect},jnode,idof2) = elapu(#{$p_def_vect},jnode,idof2) + fact2(#{$p_def_vect})
elaup(#{$p_def_vect},idof1,jnode) = elaup(#{$p_def_vect},idof1,jnode) - fact1(#{$p_def_vect})
elaup(#{$p_def_vect},idof2,jnode) = elaup(#{$p_def_vect},idof2,jnode) - fact2(#{$p_def_vect})
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
do jnode = 1,pnode
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},jnode,igaus)
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
fact2(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
fact3(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,inode,igaus)
elapu(#{$p_def_vect},jnode,idof1) = elapu(#{$p_def_vect},jnode,idof1) + fact1(#{$p_def_vect})
elapu(#{$p_def_vect},jnode,idof2) = elapu(#{$p_def_vect},jnode,idof2) + fact2(#{$p_def_vect})
elapu(#{$p_def_vect},jnode,idof3) = elapu(#{$p_def_vect},jnode,idof3) + fact3(#{$p_def_vect})
elaup(#{$p_def_vect},idof1,jnode) = elaup(#{$p_def_vect},idof1,jnode) - fact1(#{$p_def_vect})
elaup(#{$p_def_vect},idof2,jnode) = elaup(#{$p_def_vect},idof2,jnode) - fact2(#{$p_def_vect})
elaup(#{$p_def_vect},idof3,jnode) = elaup(#{$p_def_vect},idof3,jnode) - fact3(#{$p_def_vect})
end do
end do
end do
end if
EOF
sixth_nest = <<EOF
do igaus = 1,pgaus
do inode = 1,pnode
do jnode = inode+1,pnode
fact1(#{$p_def_vect}) = gpsp1_p(#{$p_def_vect},igaus) * wgrgr(#{$p_def_vect},jnode,inode,igaus) * gpvol(#{$p_def_vect},igaus)
elapp(#{$p_def_vect},jnode,inode) = elapp(#{$p_def_vect},jnode,inode) + fact1(#{$p_def_vect})
elapp(#{$p_def_vect},inode,jnode) = elapp(#{$p_def_vect},inode,jnode) + fact1(#{$p_def_vect})
end do
fact1(#{$p_def_vect}) = gpsp1_p(#{$p_def_vect},igaus) * wgrgr(#{$p_def_vect},inode,inode,igaus) * gpvol(#{$p_def_vect},igaus)
elapp(#{$p_def_vect},inode,inode) = elapp(#{$p_def_vect},inode,inode) + fact1(#{$p_def_vect})
end do
end do
EOF
seventh_nest = <<EOF
if( kfl_limit_nsi == -1 ) then
gpvep(#{$p_def_vect},:,:) = 0.0_rp
else if( kfl_limit_nsi > 0 ) then
do igaus = 1,pgaus
c1(#{$p_def_vect}) = 0.0_rp
c2(#{$p_def_vect}) = 0.0_rp
c3(#{$p_def_vect}) = 0.0_rp
do idime = 1,ndime
c4(#{$p_def_vect}) = 0.0_rp
do inode = 1,pnode
c4(#{$p_def_vect}) = c4(#{$p_def_vect}) + agrau(#{$p_def_vect},inode,igaus) * elvel(#{$p_def_vect},idime,inode,1)
end do
c4(#{$p_def_vect}) = gpsp1(#{$p_def_vect},igaus) * c4(#{$p_def_vect})
c1(#{$p_def_vect}) = c1(#{$p_def_vect}) + ( gpvep(#{$p_def_vect},idime,igaus) - c4(#{$p_def_vect}) )**2
c3(#{$p_def_vect}) = c3(#{$p_def_vect}) + gpvep(#{$p_def_vect},idime,igaus) * gpvep(#{$p_def_vect},idime,igaus)
c2(#{$p_def_vect}) = c2(#{$p_def_vect}) + c4(#{$p_def_vect}) * c4(#{$p_def_vect})
end do
c3(#{$p_def_vect}) = sqrt( c2(#{$p_def_vect}) ) + sqrt( c3(#{$p_def_vect}) )
c1(#{$p_def_vect}) = sqrt( c1(#{$p_def_vect}) )
beta(#{$p_def_vect}) = c1(#{$p_def_vect}) / ( c3(#{$p_def_vect}) + epsilon(1.0_rp) )
if( kfl_limit_nsi == 1 ) then
alpha(#{$p_def_vect}) = min(1.0_rp,2.0_rp*(1.0-beta(#{$p_def_vect})))
else if( kfl_limit_nsi == 2 ) then
alpha(#{$p_def_vect}) = 0.5_rp*(tanh(20.0_rp*(beta(#{$p_def_vect})-0.8_rp))+1.0_rp)
end if
do idime = 1,ndime
gpvep(#{$p_def_vect},idime,igaus) = alpha(#{$p_def_vect}) * gpvep(#{$p_def_vect},idime,igaus)
end do
end do
end if
EOF
eighth_nest = <<EOF
do igaus = 1,pgaus
do idime = 1,ndime
gpgrp(#{$p_def_vect},idime,igaus) = gpgrp(#{$p_def_vect},idime,igaus) + gpsp1_p(#{$p_def_vect},igaus) * gprhs(#{$p_def_vect},idime,igaus)
end do
end do
EOF
nineth_nest = <<EOF
if( kfl_sgsti_nsi == 1 ) then
do igaus = 1,pgaus
fact1(#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtsgs(#{$p_def_vect}) * gpsp1_v(#{$p_def_vect},igaus)
fact1_p (#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtsgs(#{$p_def_vect}) * gpsp1_p(#{$p_def_vect},igaus)
do idime = 1,ndime
gpvep(#{$p_def_vect},idime,igaus) = gpvep(#{$p_def_vect},idime,igaus) + fact1(#{$p_def_vect}) * gpsgs(#{$p_def_vect},idime,igaus,2)
gpgrp(#{$p_def_vect},idime,igaus) = gpgrp(#{$p_def_vect},idime,igaus) + fact1_p(#{$p_def_vect}) * gpsgs(#{$p_def_vect},idime,igaus,2)
end do
end do
end if
EOF
tenth_nest = <<EOF
if( ndime == 2 ) then
do igaus = 1,pgaus
fact4(#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtinv_loc(#{$p_def_vect})
do itime = 2,nbdfp_nsi
gprhs(#{$p_def_vect},1,igaus) = gprhs(#{$p_def_vect},1,igaus) - pabdf_nsi(itime) * fact4(#{$p_def_vect}) * gpvel(#{$p_def_vect},1,igaus,itime)
gprhs(#{$p_def_vect},2,igaus) = gprhs(#{$p_def_vect},2,igaus) - pabdf_nsi(itime) * fact4(#{$p_def_vect}) * gpvel(#{$p_def_vect},2,igaus,itime)
end do
do inode = 1,pnode
fact1(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus)
fact3(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * agrau(#{$p_def_vect},inode,igaus)
elrbu(#{$p_def_vect},1,inode) = elrbu(#{$p_def_vect},1,inode) + fact1(#{$p_def_vect}) * gprhs(#{$p_def_vect},1,igaus) + fact3(#{$p_def_vect}) * gpvep(#{$p_def_vect},1,igaus)
elrbu(#{$p_def_vect},2,inode) = elrbu(#{$p_def_vect},2,inode) + fact1(#{$p_def_vect}) * gprhs(#{$p_def_vect},2,igaus) + fact3(#{$p_def_vect}) * gpvep(#{$p_def_vect},2,igaus)
elrbp(#{$p_def_vect},inode) = elrbp(#{$p_def_vect},inode) + gpvol(#{$p_def_vect},igaus) * ( & ! ( P2' , grad(q) )
& gpcar(#{$p_def_vect},1,inode,igaus) * gpgrp(#{$p_def_vect},1,igaus) &
& + gpcar(#{$p_def_vect},2,inode,igaus) * gpgrp(#{$p_def_vect},2,igaus) )
end do
end do
else
do igaus = 1,pgaus
fact4(#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtinv_loc(#{$p_def_vect})
do itime = 2,nbdfp_nsi
gprhs(#{$p_def_vect},1,igaus) = gprhs(#{$p_def_vect},1,igaus) - pabdf_nsi(itime) * fact4(#{$p_def_vect}) * gpvel(#{$p_def_vect},1,igaus,itime)
gprhs(#{$p_def_vect},2,igaus) = gprhs(#{$p_def_vect},2,igaus) - pabdf_nsi(itime) * fact4(#{$p_def_vect}) * gpvel(#{$p_def_vect},2,igaus,itime)
gprhs(#{$p_def_vect},3,igaus) = gprhs(#{$p_def_vect},3,igaus) - pabdf_nsi(itime) * fact4(#{$p_def_vect}) * gpvel(#{$p_def_vect},3,igaus,itime)
end do
do inode = 1,pnode
fact1 = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus)
fact3 = gpvol(#{$p_def_vect},igaus) * agrau(#{$p_def_vect},inode,igaus)
elrbu(#{$p_def_vect},1,inode) = elrbu(#{$p_def_vect},1,inode) + fact1(#{$p_def_vect}) * gprhs(#{$p_def_vect},1,igaus) + fact3(#{$p_def_vect}) * gpvep(#{$p_def_vect},1,igaus)
elrbu(#{$p_def_vect},2,inode) = elrbu(#{$p_def_vect},2,inode) + fact1(#{$p_def_vect}) * gprhs(#{$p_def_vect},2,igaus) + fact3(#{$p_def_vect}) * gpvep(#{$p_def_vect},2,igaus)
elrbu(#{$p_def_vect},3,inode) = elrbu(#{$p_def_vect},3,inode) + fact1(#{$p_def_vect}) * gprhs(#{$p_def_vect},3,igaus) + fact3(#{$p_def_vect}) * gpvep(#{$p_def_vect},3,igaus)
elrbp(#{$p_def_vect},inode) = elrbp(#{$p_def_vect},inode) + gpvol(#{$p_def_vect},igaus) * ( &
& gpcar(#{$p_def_vect},1,inode,igaus) * gpgrp(#{$p_def_vect},1,igaus) &
& + gpcar(#{$p_def_vect},2,inode,igaus) * gpgrp(#{$p_def_vect},2,igaus) &
& + gpcar(#{$p_def_vect},3,inode,igaus) * gpgrp(#{$p_def_vect},3,igaus) )
end do
end do
end if
EOF
require_relative './KSplitOss.rb'
<<ref-mocks>>
<<generate-ref-declaration>>
<<def-parameters>>
<<ref-init>>
class KSplitOssRef_v1 < KSplitOss
def generate
macros = ""
if @opts[:preprocessor] then
$p_vector_length = "VECTOR_SIZE"
$p_def_vect = "DEF_VECT"
macros = <<EOF
#define VECTOR_SIZE #{@opts[:vector_length]}
EOF
else
$p_vector_length = "#{@opts[:vector_length]}"
$p_def_vect = "1:#{@opts[:vector_length]}"
end
<<ref-nest1-v1>>
<<ref-nest2-v1>>
<<ref-nest3-v1>>
<<ref-nest4-v1>>
<<ref-nest5-v1>>
<<ref-nest6-v1>>
<<ref-nest7-v1>>
<<ref-nest8-v1>>
<<ref-nest9-v1>>
<<ref-nest10-v1>>
set_lang(FORTRAN)
@kernel = CKernel::new(:includes => "immintrin.h")
@kernel.procedure = declare_procedure
get_output.print macros
get_output.print generate_mocks
get_output.print generate_ref_declaration
get_output.print generate_ref_initialization
get_output.print first_nest if @opts[:nests].include? 1
get_output.print second_nest if @opts[:nests].include? 2
get_output.print third_nest if @opts[:nests].include? 3
get_output.print fourth_nest if @opts[:nests].include? 4
get_output.print fifth_nest if @opts[:nests].include? 5
get_output.print sixth_nest if @opts[:nests].include? 6
get_output.print seventh_nest if @opts[:nests].include? 7
get_output.print eighth_nest if @opts[:nests].include? 8
get_output.print nineth_nest if @opts[:nests].include? 9
get_output.print tenth_nest if @opts[:nests].include? 10
get_output.print "end subroutine nsi_element_assembly_split_oss"
end
end
agrau(#{$p_def_vect},:,:) = 0.0_rp
wgrgr(#{$p_def_vect},:,:,:) = 0.0_rp
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
agrau(#{$p_def_vect},inode,igaus) = agrau(#{$p_def_vect},inode,igaus) + &
gpadv(#{$p_def_vect},idime,igaus) * gpcar(#{$p_def_vect},idime,inode,igaus)
end do
agrau(#{$p_def_vect},inode,igaus) = gpden(#{$p_def_vect},igaus) * agrau(#{$p_def_vect},inode,igaus)
do jnode = 1,pnode
do idime = 1,ndime
wgrgr(#{$p_def_vect},inode,jnode,igaus) = wgrgr(#{$p_def_vect},inode,jnode,igaus) + &
gpcar(#{$p_def_vect},idime,inode,igaus)*gpcar(#{$p_def_vect},idime,jnode,igaus)
end do
end do
end do
end do
do igaus = 1,pgaus
fact0(#{$p_def_vect}) = gpsp2_v(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact6(#{$p_def_vect}) = gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact7(#{$p_def_vect}) = gpsp1_v(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus)
fact8(#{$p_def_vect}) = pabdf_nsi(1) * gpden(#{$p_def_vect},igaus) * dtinv_mod(#{$p_def_vect}) + gppor(#{$p_def_vect},igaus)
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime+idime
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},idime,inode,igaus)
do jnode = 1,pnode
do jdime = 1,ndime
jdofv = (jnode-1)*ndime+jdime
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},jdime,jnode,igaus)
end do
jdofv = (jnode-1)*ndime+idime
fact4(#{$p_def_vect}) = gpsha(#{$p_def_vect},inode,igaus) * gpvol(#{$p_def_vect},igaus)
fact5(#{$p_def_vect}) = fact4(#{$p_def_vect}) * ( agrau(#{$p_def_vect},jnode,igaus) + fact8(#{$p_def_vect}) * gpsha(#{$p_def_vect},jnode,igaus) ) + fact6(#{$p_def_vect}) * wgrgr(#{$p_def_vect},inode,jnode,igaus) + fact7(#{$p_def_vect}) * agrau(#{$p_def_vect},jnode,igaus) * agrau(#{$p_def_vect},inode,igaus)
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact5(#{$p_def_vect})
end do
end do
end do
end do
if( fvins_nsi > 0.9_rp ) then
do igaus = 1,pgaus
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
do jnode = 1,pnode
fact1(#{$p_def_vect}) = gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,jnode,igaus)
do jdime = 1,ndime
jdofv = (jnode-1)*ndime + jdime
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},jdime,inode,igaus)
end do
end do
if( fvins_nsi == 2.0_rp ) then
fact1(#{$p_def_vect}) = -2.0_rp / 3.0_rp * gpvis(#{$p_def_vect},igaus) * gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,inode,igaus)
do jnode = 1,pnode
do jdime = 1,ndime
jdofv = (jnode-1)*ndime + jdime
elauu(#{$p_def_vect},idofv,jdofv) = elauu(#{$p_def_vect},idofv,jdofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},jdime,jnode,igaus)
end do
end do
end if
end do
end do
end do
end if
if( kfl_lumped == 1 ) then
if( ndime == 2 ) then
stop
else
do igaus = 1,pgaus
gpveo(#{$p_def_vect},1:3) = 0.0_rp
do inode = 1,pnode
do idime = 1,ndime
gpveo(#{$p_def_vect},idime) = gpveo(#{$p_def_vect},idime) + elvel(#{$p_def_vect},idime,inode,2) * gpsha(#{$p_def_vect},inode,igaus)
end do
end do
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpden(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus) * dtinv_mod(#{$p_def_vect})
elauu(#{$p_def_vect},idof1,idof1) = elauu(#{$p_def_vect},idof1,idof1) + fact0(#{$p_def_vect})
elauu(#{$p_def_vect},idof2,idof2) = elauu(#{$p_def_vect},idof2,idof2) + fact0(#{$p_def_vect})
elauu(#{$p_def_vect},idof3,idof3) = elauu(#{$p_def_vect},idof3,idof3) + fact0(#{$p_def_vect})
do idime = 1,ndime
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) - fact0(#{$p_def_vect}) * gpveo(#{$p_def_vect},idime)
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) + fact0(#{$p_def_vect}) * elvel(#{$p_def_vect},idime,inode,2)
end do
do jnode = 1,pnode
jdof1 = 3*jnode-2
jdof2 = 3*jnode-1
jdof3 = 3*jnode
elauu(#{$p_def_vect},idof1,jdof1) = elauu(#{$p_def_vect},idof1,jdof1) - fact0(#{$p_def_vect}) * gpsha(#{$p_def_vect},jnode,igaus)
elauu(#{$p_def_vect},idof2,jdof2) = elauu(#{$p_def_vect},idof2,jdof2) - fact0(#{$p_def_vect}) * gpsha(#{$p_def_vect},jnode,igaus)
elauu(#{$p_def_vect},idof3,jdof3) = elauu(#{$p_def_vect},idof3,jdof3) - fact0(#{$p_def_vect}) * gpsha(#{$p_def_vect},jnode,igaus)
end do
end do
end do
end if
else if( kfl_lumped == 2 ) then
do igaus = 1,pgaus
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpden(#{$p_def_vect},igaus) * dtinv_mod(#{$p_def_vect})
do inode = 1, pnode
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpsha(#{$p_def_vect},inode,igaus)
do idime = 1,ndime
idof1 = (inode-1) * ndime + idime
elauu(#{$p_def_vect},idof1,idof1) = elauu(#{$p_def_vect},idof1,idof1) + fact1(#{$p_def_vect})
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) + fact1(#{$p_def_vect}) * elvel(#{$p_def_vect},idime,inode,2)
end do
end do
end do
end if
if( ndime == 2 ) then
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 2*inode-1
idof2 = 2*inode
do jnode = 1,pnode
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},jnode,igaus)
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
fact2(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
elapu(#{$p_def_vect},jnode,idof1) = elapu(#{$p_def_vect},jnode,idof1) + fact1(#{$p_def_vect})
elapu(#{$p_def_vect},jnode,idof2) = elapu(#{$p_def_vect},jnode,idof2) + fact2(#{$p_def_vect})
elaup(#{$p_def_vect},idof1,jnode) = elaup(#{$p_def_vect},idof1,jnode) - fact1(#{$p_def_vect})
elaup(#{$p_def_vect},idof2,jnode) = elaup(#{$p_def_vect},idof2,jnode) - fact2(#{$p_def_vect})
end do
end do
end do
else
do igaus = 1,pgaus
do inode = 1,pnode
idof1 = 3*inode-2
idof2 = 3*inode-1
idof3 = 3*inode
do jnode = 1,pnode
fact0(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},jnode,igaus)
fact1(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},1,inode,igaus)
fact2(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},2,inode,igaus)
fact3(#{$p_def_vect}) = fact0(#{$p_def_vect}) * gpcar(#{$p_def_vect},3,inode,igaus)
elapu(#{$p_def_vect},jnode,idof1) = elapu(#{$p_def_vect},jnode,idof1) + fact1(#{$p_def_vect})
elapu(#{$p_def_vect},jnode,idof2) = elapu(#{$p_def_vect},jnode,idof2) + fact2(#{$p_def_vect})
elapu(#{$p_def_vect},jnode,idof3) = elapu(#{$p_def_vect},jnode,idof3) + fact3(#{$p_def_vect})
elaup(#{$p_def_vect},idof1,jnode) = elaup(#{$p_def_vect},idof1,jnode) - fact1(#{$p_def_vect})
elaup(#{$p_def_vect},idof2,jnode) = elaup(#{$p_def_vect},idof2,jnode) - fact2(#{$p_def_vect})
elaup(#{$p_def_vect},idof3,jnode) = elaup(#{$p_def_vect},idof3,jnode) - fact3(#{$p_def_vect})
end do
end do
end do
end if
if( kfl_stabi_nsi /= -1 ) then
do igaus = 1,pgaus
do inode = 1,pnode
do jnode = inode+1,pnode
fact1(#{$p_def_vect}) = gpsp1_p(#{$p_def_vect},igaus) * wgrgr(#{$p_def_vect},jnode,inode,igaus) * gpvol(#{$p_def_vect},igaus)
elapp(#{$p_def_vect},jnode,inode) = elapp(#{$p_def_vect},jnode,inode) + fact1(#{$p_def_vect})
elapp(#{$p_def_vect},inode,jnode) = elapp(#{$p_def_vect},inode,jnode) + fact1(#{$p_def_vect})
end do
fact1(#{$p_def_vect}) = gpsp1_p(#{$p_def_vect},igaus) * wgrgr(#{$p_def_vect},inode,inode,igaus) * gpvol(#{$p_def_vect},igaus)
elapp(#{$p_def_vect},inode,inode) = elapp(#{$p_def_vect},inode,inode) + fact1(#{$p_def_vect})
end do
end do
end if
do igaus = 1,pgaus
fact1(#{$p_def_vect}) = penal_nsi * gpvol(#{$p_def_vect},igaus)
do inode = 1,pnode
elapp(#{$p_def_vect},inode,inode) = elapp(#{$p_def_vect},inode,inode) + fact1(#{$p_def_vect}) * gpsha(#{$p_def_vect},inode,igaus)
elrbp(#{$p_def_vect},inode) = elrbp(#{$p_def_vect},inode) + fact1(#{$p_def_vect}) * gpsha(#{$p_def_vect},inode,igaus) * elpre(#{$p_def_vect},inode,1)
end do
end do
if( kfl_stabi_nsi == -1 ) then
gpvep(#{$p_def_vect},:,:) = 0.0_rp
else if( kfl_limit_nsi == -1 ) then
gpvep(#{$p_def_vect},:,:) = 0.0_rp
else if( kfl_limit_nsi > 0 ) then
do igaus = 1,pgaus
c1(#{$p_def_vect}) = 0.0_rp
c2(#{$p_def_vect}) = 0.0_rp
c3(#{$p_def_vect}) = 0.0_rp
do idime = 1,ndime
c4(#{$p_def_vect}) = 0.0_rp
do inode = 1,pnode
c4(#{$p_def_vect}) = c4(#{$p_def_vect}) + agrau(#{$p_def_vect},inode,igaus) * elvel(#{$p_def_vect},idime,inode,1)
end do
c4(#{$p_def_vect}) = gpsp1(#{$p_def_vect},igaus) * c4(#{$p_def_vect})
c1(#{$p_def_vect}) = c1(#{$p_def_vect}) + ( gpvep(#{$p_def_vect},idime,igaus) - c4(#{$p_def_vect}) )**2
c3(#{$p_def_vect}) = c3(#{$p_def_vect}) + gpvep(#{$p_def_vect},idime,igaus) * gpvep(#{$p_def_vect},idime,igaus)
c2(#{$p_def_vect}) = c2(#{$p_def_vect}) + c4(#{$p_def_vect}) * c4(#{$p_def_vect})
end do
c3(#{$p_def_vect}) = sqrt( c2(#{$p_def_vect}) ) + sqrt( c3(#{$p_def_vect}) )
c1(#{$p_def_vect}) = sqrt( c1(#{$p_def_vect}) )
beta(#{$p_def_vect}) = c1(#{$p_def_vect}) / ( c3(#{$p_def_vect}) + epsilon(1.0_rp) )
if( kfl_limit_nsi == 1 ) then
alpha(#{$p_def_vect}) = min(1.0_rp,2.0_rp*(1.0_rp-beta(#{$p_def_vect})))
else if( kfl_limit_nsi == 2 ) then
alpha(#{$p_def_vect}) = 0.5_rp*(tanh(20.0_rp*(beta(#{$p_def_vect})-0.8_rp))+1.0_rp)
end if
do idime = 1,ndime
gpvep(#{$p_def_vect},idime,igaus) = alpha(#{$p_def_vect}) * gpvep(#{$p_def_vect},idime,igaus)
end do
end do
end if
if( kfl_stabi_nsi == -1 ) then
gpgrp(#{$p_def_vect},:,:) = 0.0_rp
else
do igaus = 1,pgaus
do idime = 1,ndime
gpgrp(#{$p_def_vect},idime,igaus) = gpgrp(#{$p_def_vect},idime,igaus) + gpsp1_p(#{$p_def_vect},igaus) * gprhs(#{$p_def_vect},idime,igaus)
end do
end do
if( kfl_sgsti_nsi == 1 ) then
do igaus = 1,pgaus
fact1(#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtsgs(#{$p_def_vect}) * gpsp1_v(#{$p_def_vect},igaus)
fact1_p (#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtsgs(#{$p_def_vect}) * gpsp1_p(#{$p_def_vect},igaus)
do idime = 1,ndime
gpvep(#{$p_def_vect},idime,igaus) = gpvep(#{$p_def_vect},idime,igaus) + fact1(#{$p_def_vect}) * gpsgs(#{$p_def_vect},idime,igaus,2)
gpgrp(#{$p_def_vect},idime,igaus) = gpgrp(#{$p_def_vect},idime,igaus) + fact1_p(#{$p_def_vect}) * gpsgs(#{$p_def_vect},idime,igaus,2)
end do
end do
end if
end if
do igaus = 1,pgaus
fact4(#{$p_def_vect}) = gpden(#{$p_def_vect},igaus) * dtinv_mod(#{$p_def_vect})
do itime = 2,nbdfp_nsi
do idime = 1,ndime
gprhs(#{$p_def_vect},idime,igaus) = gprhs(#{$p_def_vect},idime,igaus) - pabdf_nsi(itime) * fact4(#{$p_def_vect}) * gpvel(#{$p_def_vect},idime,igaus,itime)
end do
end do
do inode = 1,pnode
fact1(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus) ! ( f + rho*u^n/dt , v )
fact3(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * agrau(#{$p_def_vect},inode,igaus) ! ( rho * a.grad(v) , P1' )
do idime = 1,ndime
elrbu(#{$p_def_vect},idime,inode) = elrbu(#{$p_def_vect},idime,inode) + fact1(#{$p_def_vect}) * gprhs(#{$p_def_vect},idime,igaus) &
& + fact3(#{$p_def_vect}) * gpvep(#{$p_def_vect},idime,igaus)
end do
elrbp(#{$p_def_vect},inode) = elrbp(#{$p_def_vect},inode) + gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus) * gprhc(#{$p_def_vect},igaus) ! ( rhs, q )
do idime = 1,ndime
elrbp(#{$p_def_vect},inode) = elrbp(#{$p_def_vect},inode) + gpvol(#{$p_def_vect},igaus) * gpcar(#{$p_def_vect},idime,inode,igaus) * gpgrp(#{$p_def_vect},idime,igaus) ! ( P2' , grad(q) )
end do
end do
end do
if( maxval(pbubl) == 1 ) then
if( kfl_stabi_nsi /= -1 ) then
write(6,*) 'BUBBLE NOT CODED FOR SPLIT OSS'
stop
end if
elauq = 0.0_rp
elapq = 0.0_rp
elaqu = 0.0_rp
elaqp = 0.0_rp
elaqq = 0.0_rp
elrbq = 0.0_rp
if( kfl_press_nsi == 1 ) then
do igaus = 1,pgaus
fact1(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha_bub(#{$p_def_vect},igaus)
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
elauq(#{$p_def_vect},idofv,1) = elauq(#{$p_def_vect},idofv,1) - fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},idime,inode,igaus)
elaqu(#{$p_def_vect},1,idofv) = elaqu(#{$p_def_vect},1,idofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},idime,inode,igaus)
end do
end do
end do
else
do igaus = 1,pgaus
fact1(#{$p_def_vect}) = gpvol(#{$p_def_vect},igaus) * gpsha_bub(#{$p_def_vect},igaus)
do inode = 1,pnode
do idime = 1,ndime
idofv = (inode-1)*ndime + idime
elauq(#{$p_def_vect},idofv,1) = elauq(#{$p_def_vect},idofv,1) + gpvol(#{$p_def_vect},igaus) * gpsha(#{$p_def_vect},inode,igaus) * gpcar_bub(#{$p_def_vect},idime,igaus)
elaqu(#{$p_def_vect},1,idofv) = elaqu(#{$p_def_vect},1,idofv) + fact1(#{$p_def_vect}) * gpcar(#{$p_def_vect},idime,inode,igaus)
end do
end do
end do
end if
do igaus = 1,pgaus
elaqq(#{$p_def_vect},1,1) = elaqq(#{$p_def_vect},1,1) + gpvol(#{$p_def_vect},igaus) * gpsha_bub(#{$p_def_vect},igaus) * penal_nsi
elrbq(#{$p_def_vect},1) = elrbq(#{$p_def_vect},1) + gpvol(#{$p_def_vect},igaus) * gpsha_bub(#{$p_def_vect},igaus) * penal_nsi * elbub(#{$p_def_vect})
elrbq(#{$p_def_vect},1) = elrbq(#{$p_def_vect},1) + gpvol(#{$p_def_vect},igaus) * gpsha_bub(#{$p_def_vect},igaus) * gprhc(#{$p_def_vect},igaus)
end do
end if
Finally this class assembles everything to generate the reference implementation:
- the mocks,
- the variables declaration,
- the initialization,
- the different loop nests
The FORTRAN code is then simply printed in the body of the main
procedure using get_output.print
.
require_relative './KSplitOss.rb'
<<ref-mocks>>
<<generate-ref-declaration>>
<<def-parameters>>
<<ref-init>>
class KSplitOssRef_v2 < KSplitOss
def generate
# opts = {:vector_length => 1, :preprocessor => false, :nests => (1..10).to_a}
# opts.update(options)
macros = ""
if @opts[:preprocessor] then
$p_vector_length = "VECTOR_SIZE"
$p_def_vect = "DEF_VECT"
macros = <<EOF
#define VECTOR_SIZE #{@opts[:vector_length]}
EOF
else
$p_vector_length = "#{@opts[:vector_length]}"
$p_def_vect = "1:#{@opts[:vector_length]}"
end
nests = []
nests.push <<EOF
<<ref-nest1-v2>>
EOF
nests.push <<EOF
<<ref-nest2-v2>>
EOF
nests.push <<EOF
<<ref-nest3-v2>>
EOF
nests.push <<EOF
<<ref-nest4-v2>>
EOF
nests.push <<EOF
<<ref-nest5-v2>>
EOF
nests.push <<EOF
<<ref-nest6-v2>>
EOF
nests.push <<EOF
<<ref-nest7-v2>>
EOF
nests.push <<EOF
<<ref-nest8-v2>>
EOF
nests.push <<EOF
<<ref-nest9-v2>>
EOF
nests.push <<EOF
<<ref-nest10-v2>>
EOF
nests.push <<EOF
<<ref-nest11-v2>>
EOF
set_lang(FORTRAN)
@kernel = CKernel::new(:includes => "immintrin.h")
@kernel.procedure = declare_procedure
get_output.print macros
get_output.print generate_mocks
get_output.print generate_ref_declaration
get_output.print generate_ref_initialization
@opts[:nests].each{|n|
get_output.print nests[n-1]
}
get_output.print "end subroutine nsi_element_assembly_split_oss"
return @kernel
end
end
For the BOAST implementation we followed the same logic as
previous. We need to declare the kernel header (which is done by
the class KSPlitOss
) and the local variables, initialize the
variables, create mocks and split the kernel according to the
different nests.
The local variable declaration is simply done using the parameters defined in the module Arguments.
# Local arrays
@gpsp1_p = $gpsp1_p
@gpsp1_v = $gpsp1_v
@gpsp2_v = $gpsp2_v
@c1 = $c1
@c2 = $c2
@c3 = $c3
@c4 = $c4
@alpha = $alpha
@beta = $beta
@gpveo = $gpveo
@fact1_p = $fact1_p
@dtinv_mod = $dtinv_mod
@inode = $inode
@jnode = $jnode
@jdime = $jdime
@idofv = $idofv
@ivect = $ivect
@igaus = $igaus
@idime = $idime
@jdofv = $jdofv
@itime = $itime
@fact = $fact
@idof = $idof
@jdof = $jdof
decl @gpsp1_p,@gpsp1_v,@gpsp2_v,@c1,@c2,@c3,@c4,@alpha,@beta,
@gpveo,@fact1_p,@dtinv_mod,@inode,@jnode,@jdime,@idofv,
@ivect,@igaus,@idime,@jdofv,@itime,@fact,@idof,@jdof
Writing the initialization of the arrays for FORTRAN with BOAST is
straight forward but in C we need to use C code to the set the
arrays efficiently. These instruction are stored as strings
and
included into the kernel with get_output.print
.
<<boast-initialization>>
if get_lang == FORTRAN
pr @dtinv_mod === @dtinv_loc
pr @gpsp1_p === @gpsp1
pr @gpsp1_v === @gpsp1
pr @gpsp2_v === @gpsp2
pr If( @kfl_nota1_nsi == 1 => lambda{ pr @gpsp1_v === 0.0 } )
pr @elrbp === 0.0
pr @elrbu === 0.0
pr @elapp === 0.0
pr @elauu === 0.0
pr @elaup === 0.0
pr @elapu === 0.0
elsif get_lang == C
code =<<EOF
memcpy(dtinv_mod, dtinv_loc, sizeof(dtinv_mod));
memcpy(gpsp1_p, gpsp1, sizeof(gpsp1_p));
memcpy(gpsp1_v, gpsp1, sizeof(gpsp1_v));
memcpy(gpsp2_v, gpsp2, sizeof(gpsp2_v));
if (kfl_nota1_nsi == 1) memset(gpsp1_v, 0, sizeof(gpsp1_v));
memset(elrbp, 0, sizeof(#{@elrbp.type.decl}) * pnode);
memset(elrbu, 0, sizeof(#{@elrbu.type.decl}) * ndime * pnode);
memset(elapp, 0, sizeof(#{@elapp.type.decl}) * pnode * pnode);
memset(elauu, 0, sizeof(#{@elauu.type.decl}) * pnode * ndime * pnode * ndime);
memset(elaup, 0, sizeof(#{@elaup.type.decl}) * pnode * ndime * pnode);
memset(elapu, 0, sizeof(#{@elapu.type.decl}) * pnode * pnode * ndime);
EOF
get_output.print code
end
Like in the reference implementation we split the kernel according
to the different loop nests. This gave us the possibility to
either wrap the nests into subroutines. We can either inline them
manually, rely on the compiler to do it or just use a simple
function call. Subroutines are wrapped into classes for more
convenience. Like it is the case for the kernel, for each
subroutine, the parameters (with their direction :in
, :out
,
:inout
) and variables used have to be defined.
The goal of the class Subroutine
is to wrap the different loop
nests into subroutines which can either be inlined manually, by
the compiler or used as simple subroutine. The properties of a
subroutine (such as the vector length) are registered but also the
parameters and the variables used by the subroutine which are created
dynamically. The usage of a subroutine can either be :included
,
:call
, :inline
. Depending on the usage, the function construct
will
either only return the code of the subroutine to paste it in the
kernel or generate a Procedure
that can be inlined or not.
The function call
is here only just avoid to have to specify the
arguments when calling the subroutine effectively.
require_relative './arguments.rb'
class Subroutine
include Arguments
attr_accessor :code
def initialize(name,options,functions)
opts = {:vector_length => 1, :unroll => false, :inline => :included}
opts.update(options)
@name = name
@functions = functions
@args = []
@code = nil
@vector_length = options[:vector_length]
@usage = opts[:inline]
@unroll = opts[:unroll]
@dimensions = @unroll ? [2,3] : [$ndime]
@nb_original_param = instance_variables.length + 1
end
def construct(block)
if @usage == :included then
@code = block
else
functions = @functions.nil? ? nil : @functions.values
inlined = @usage == :inlined ? true : false
@code = Procedure(@name, @args, :inline => inlined, :functions => functions, &block)
end
end
def call
if @usage == :included then
@code.call
else
eval "pr @code.call( #{instance_variables[@nb_original_param...instance_variables.length].join(',')} )"
end
end
end
The values of ndime
can be either 2 or 3 and here we unroll the
computation of agrau
and wgrgr
according to this. For this, we
build 2 expressions exp1
and exp2
which are unrolled 2 and 3 times.
Then these expressions are pasted into the formulas that compute
agrau
and wgrgr
. Next with these formulas, we build 2 loop nests
(for each possibility to unroll the computation of agrau
and
wgrgr
, here 2 and 3) that we stack into for1
, and finally we wrap
these loops in a if statement.
<<boast-nest1-body>>
for1 = []
[2,3].each{|dim|
exp1 = ""
exp2 = ""
dim.times{|i|
exp1 += "@gpadv[#{i+1},igaus]*@gpcar[#{i+1},inode,igaus]"
exp2 += "@gpcar[#{i+1},inode,igaus]*@gpcar[#{i+1},jnode,igaus]"
exp1 += " + " if i+1 < dim
exp2 += " + " if i+1 < dim
}
form_agrau = "pr @agrau[inode,igaus] === @gpden[igaus] * (#{exp1})"
form_wgrgr = "pr @wgrgr[inode,jnode,igaus] === #{exp2}"
for1.push For(igaus,1,@pgaus){
pr For(inode,1,@pnode){
eval form_agrau
pr For(jnode,1,@pnode){
eval form_wgrgr
}
}
}
}
pr If(@ndime == 2 => lambda{
pr for1[0]
}, else: lambda{
pr for1[1]
})
<<boast-nest1-class>>
The nest 1 is encapsulated into the class Nest1
. First,
parameters and local variables are defined. Then the block
that
contains the nest1 is built by declaring the local variables if
the inline is done manually (otherwise they will conflict with
the kernel ones) and using the previous descriptions of the
nest1 <<boast-nest1-body>>.
require_relative './subroutine.rb'
require_relative './KSplitOss.rb'
class Nest1 < Subroutine
def initialize(options,functions = nil)
super("nest1",options,functions)
# ins
@ndime = Parameters.copy($ndime,:in)
@mnode = Parameters.copy($mnode,:in)
@pnode = Parameters.copy($pnode,:in)
@pgaus = Parameters.copy($pgaus,:in)
@gpden = Parameters.copy($gpden,:in)
@gpcar = Parameters.copy($gpcar,:in)
@gpadv = Parameters.copy($gpadv,:in)
# inouts
@wgrgr = Parameters.copy($wgrgr,:inout)
@agrau = Parameters.copy($agrau,:inout)
@args.push @ndime,@mnode,@pnode,@pgaus,@gpden,@gpcar,@gpadv,@wgrgr,@agrau
end
def generate
# locals
inode = $inode
jnode = $jnode
igaus = $igaus
block = lambda{
decl inode,jnode,igaus unless @usage == :included
<<boast-nest1-body>>
}
construct(block)
end
end
Same principle as before, the code is unrolled according to
ndime
or not unrolled at all, then the different version of the
nest loop are stacked into array, and wrap into an if
statement.
for1 = []
@dimensions.each{|dim|
for1.push For(igaus,1,@pgaus){
pr fact[1] === @gpsp2[igaus] * @gpvol[igaus]
pr fact[7] === @gpvis[igaus] * @gpvol[igaus]
pr fact[8] === @gpsp1_v[igaus] * @gpvol[igaus]
pr fact[9] === @gpden[igaus] * @functions[:pabdf_nsi].call(1) * @dtinv_loc[1] + @gppor[igaus]
pr For(inode,1,@pnode){
pr For(idime,1,dim){
pr idof[1] === (inode-1)*@ndime+idime
pr fact[2] === fact[1] * @gpcar[idime,inode,igaus]
pr For(jnode,1,@pnode){
pr For(jdime,1,dim){
pr jdof[jdime] === (jnode-1)*@ndime+jdime
pr @elauu[idof[1],jdof[jdime]] === @elauu[idof[1],jdof[jdime]] + fact[2] * @gpcar[jdime,jnode,igaus]
}.unroll(@unroll)
pr jdof[1] === (jnode-1)*@ndime+idime
pr fact[5] === @gpsha[inode,igaus] * @gpvol[igaus]
pr fact[6] === fact[5] * ( @agrau[jnode,igaus] + fact[9] * @gpsha[jnode,igaus] ) + fact[7] * @wgrgr[inode,jnode,igaus] + fact[8] * @agrau[jnode,igaus] * @agrau[inode,igaus]
pr @elauu[idof[1],jdof[1]] === @elauu[idof[1],jdof[1]] + fact[6]
}
}
}
}
}
main_block = lambda{
decl fact,igaus,inode,jnode,idof,jdof,idime,jdime unless @usage == :included
if @unroll then
pr If(@ndime == 2 => lambda{
pr for1[0]
}, else: lambda{
pr for1[1]
})
else
pr for1[0]
end
}
Same principle as in <<boast-nest1-class>>
class Nest2 < Subroutine
def initialize(options, functions = nil)
super("nest2",options,functions)
# ins
@ndime = Parameters.copy($ndime,:in)
@mnode = Parameters.copy($mnode,:in)
@pnode = Parameters.copy($pnode,:in)
@pgaus = Parameters.copy($pgaus,:in)
@gpden = Parameters.copy($gpden,:in)
@gpcar = Parameters.copy($gpcar,:in)
@gpsp2 = Parameters.copy($gpsp2,:in)
@gpvol = Parameters.copy($gpvol,:in)
@gpvis = Parameters.copy($gpvis,:in)
@dtinv_loc = Parameters.copy($dtinv_loc,:in)
@gppor = Parameters.copy($gppor,:in)
@gpsha = Parameters.copy($gpsha,:in)
@gpsp1_v = Parameters.copy($gpsp1_v,:in)
@wgrgr = Parameters.copy($wgrgr,:in)
@agrau = Parameters.copy($agrau,:in)
#inouts
@elauu = Parameters.copy($elauu,:inout)
@args.push @ndime,@mnode,@pnode,@pgaus,@gpden,@gpcar,@gpsp2,@gpvol,@gpvis,@dtinv_loc,
@gppor,@gpsha,@gpsp1_v,@wgrgr,@agrau,@elauu
end
def generate
# locals
fact = Parameters.copy($fact)
igaus = Parameters.copy($igaus)
inode = Parameters.copy($inode)
jnode = Parameters.copy($jnode)
idof = Parameters.copy($idof)
jdof = Parameters.copy($jdof)
idime = Parameters.copy($idime)
jdime = Parameters.copy($jdime)
<<boast-nest2-body>>
construct(main_block)
end
end
In this nest every expressions and group of expression have been stored into variables, same for the loops as we envision the possibility to reorder the loops as we think it could be beneficial. This way loops and expressions can be reused and eases the reordering. However this feature is not implemented yet and the order follows the reference implementation one.
for_inode = For(inode,1,@pnode)
for_jnode = For(jnode,1,@pnode)
for_idime = For(idime,1,@ndime)
for_jdime = lambda{ |dim| return For(jdime,1,dim) }
exp_fact1 = fact[2] === @gpvis[igaus] * @gpvol[igaus] * @gpcar[idime,jnode,igaus]
exp_idofv = idofv === (inode-1)*@ndime + idime
block_elauu_1 = lambda{
pr jdofv === (jnode-1)*@ndime + jdime
pr @elauu[idofv,jdofv] === @elauu[idofv,jdofv] + fact[2] * @gpcar[jdime,inode,igaus]
}
exp_fact1_2 = fact[2] === -2.0.to_var / 3.0.to_var * @gpvis[igaus] * @gpvol[igaus] * @gpcar[idime,inode,igaus]
block_elauu_2 = lambda{
pr jdofv === (jnode-1)*@ndime + jdime
pr @elauu[idofv,jdofv] === @elauu[idofv,jdofv] + fact[2] * @gpcar[jdime,jnode,igaus]
}
Same principle is in <<boast-nest1-body>> the code can be
unrolled for values ndime
2 and 3 or not. We previously defined
empty loops, their content can be simply filled by opening it with opn
,
print expression as usual end closing it with close
.
main_block = lambda{
decl igaus,idime,jdime,inode,jnode,fact,idofv,jdofv unless @usage == :included
pr If(@fvins_nsi > 0.9){
for1 = []
@dimensions.each{|dim|
for1.push For(igaus,1,@pgaus){
opn for_inode
opn for_idime
pr exp_idofv
opn for_jnode
pr exp_fact1
pr for_jdime.call(dim).unroll(@unroll), &block_elauu_1
close for_jnode
pr If(@fvins_nsi == 2.0){
pr exp_fact1_2
opn for_jnode
pr for_jdime.call(dim).unroll(@unroll), &block_elauu_2
close for_jnode
}
close for_idime
close for_inode
}
}
if @unroll then
pr If(@ndime == 2 => lambda{
pr for1[0]
}, else: lambda{
pr for1[1]
})
else
pr for1[0]
end
}
}
Same principle as in <<boast-nest1-class>>
class Nest3 < Subroutine
def initialize(options, functions = nil)
super("nest3",options,functions)
# ins
@pgaus = Parameters.copy($pgaus,:in)
@pnode = Parameters.copy($pnode,:in)
@mnode = Parameters.copy($mnode,:in)
@ndime = Parameters.copy($ndime,:in)
@gpvis = Parameters.copy($gpvis,:in)
@gpvol = Parameters.copy($gpvol,:in)
@gpcar = Parameters.copy($gpcar,:in)
@fvins_nsi = Parameters.copy($fvins_nsi,:in)
# inouts
@elauu = Parameters.copy($elauu,:inout)
@args.push @pgaus,@pnode,@mnode,@ndime,@gpvis,@gpvol,@gpcar,@fvins_nsi,@elauu
end
def generate
# locals
igaus = $igaus
idime = $idime
jdime = $jdime
inode = $inode
jnode = $jnode
fact = $fact
idofv = $idofv
jdofv = $jdofv
<<boast-nest3-body-expressions>>
<<boast-nest3-body-loops>>
construct(main_block)
end
end
Some expressions have been stored into variables as they are used several time in this nest.
exp_fact0 = fact[1] === @gpvol[igaus] * @gpden[igaus] * @dtinv_mod[1]
exp_fact1 = fact[2] === fact[1] * @gpsha[inode,igaus]
exp_elrbu = @elrbu[idime,inode] === @elrbu[idime,inode] + fact[2] * @elvel[idime,inode,2]
Here we build the loops that compute elauu
and elrbu
. The 2
first (for_elauu1
and for_elrbu2
) can be unrolled for ndime
equals 2 and 3 or not unrolled at all. The 2 other (for_elauu2
and for_elrbu1
) can only be unrolled for ndime
equals 3 or not.
for_elauu1 = []
for_elrbu2 = []
@dimensions.each{|dim|
for_elauu1.push For(idime,1,dim){
pr idof[idime] === (inode-1)*dim+idime
pr @elauu[idof[idime],idof[idime]] === @elauu[idof[idime],idof[idime]] + fact[2]
}
for_elrbu2.push For(idime,1,dim){
pr exp_elrbu
}
}
for_elauu2 = For(jdime,1,3){
pr jdof[jdime] === (jnode-1)*@ndime+jdime
pr @elauu[idof[jdime],jdof[jdime]] === @elauu[idof[jdime],jdof[jdime]] - fact[2] * @gpsha[jnode,igaus]
}
for_elrbu1 = For(idime,1,3){
pr @elrbu[idime,inode] === @elrbu[idime,inode] - fact[2] * @gpveo[idime]
pr exp_elrbu
}
Then we built the different blocks for the different values that
can take kfl_lumped
. Here we create the block for kfl_lumped
equals 1. This time again, we need to use C code to
initialize gpveo
.
block_kfl_lumped_1 = lambda{
pr For(igaus,1,@pgaus){
if get_lang == FORTRAN then
pr @gpveo === 0.0.to_var
elsif get_lang == C
code =<<EOF
memset(gpveo,0,sizeof(#{@gpveo.type.decl}) * #{@gpveo.dimension[0]});
EOF
get_output.print code
end
pr For(inode,1,@pnode){
pr For(idime,1,3){
pr @gpveo[idime] === @gpveo[idime] + @elvel[idime,inode,2] * @gpsha[inode,igaus]
}.unroll(@unroll)
}
pr exp_fact0
pr For(inode,1,@pnode){
pr exp_fact1
if @unroll then
pr for_elauu1[1].unroll
pr for_elrbu1.unroll
else
pr for_elauu1[0]
pr for_elrbu1
end
pr For(jnode,1,@pnode){
pr for_elauu2.unroll(@unroll)
}
}
}
}
<<boast-nest4-body-unroll-kfl-lumped>>
The following block is for kfl_lumped equals
2. However the
loops have been built earlier we actually unroll them here.
block_kfl_lumped_2 = []
@dimensions.length.times{|i|
block_kfl_lumped_2[i] = lambda{
pr For(igaus,1,@pgaus){
pr exp_fact0
pr For(inode,1,@pnode){
pr exp_fact1
pr for_elauu1[i].unroll(@unroll)
pr for_elrbu2[i].unroll(@unroll)
}
}
}
}
Finally, we position the different blocks in if
statements.
if_kfl_lumped_1 = lambda{ pr If(@ndime == 2 => lambda{}, :else => block_kfl_lumped_1) }
if @unroll then
if_kfl_lumped_2 = lambda{ pr If(@ndime == 2 => block_kfl_lumped_2[0] , :else => block_kfl_lumped_2[1])}
else
if_kfl_lumped_2 = block_kfl_lumped_2[0]
end
main_block = lambda{
decl idime,jdime,igaus,inode,jnode,fact,idof,jdof unless @usage == :included
pr If(@kfl_lumped == 1 => if_kfl_lumped_1, @kfl_lumped == 2 => if_kfl_lumped_2)
}
Same principle as in <<boast-nest1-class>>
class Nest4 < Subroutine
def initialize(options, functions = nil)
super("nest4",options,functions)
# ins
@ndime = Parameters.copy($ndime,:in)
@pgaus = Parameters.copy($pgaus,:in)
@pnode = Parameters.copy($pnode,:in)
@gpvol = Parameters.copy($gpvol,:in)
@gpden = Parameters.copy($gpden,:in)
@dtinv_mod = Parameters.copy($dtinv_mod,:in)
@gpsha = Parameters.copy($gpsha,:in)
@elvel = Parameters.copy($elvel,:in)
@kfl_lumped = Parameters.copy($kfl_lumped,:in)
# inouts
@gpveo = Parameters.copy($gpveo,:inout)
@elrbu = Parameters.copy($elrbu,:inout)
@elauu = Parameters.copy($elauu,:inout)
@args.push @ndime,@pgaus,@pnode,@gpvol,@gpden,@dtinv_mod,@gpsha,@elvel,@kfl_lumped,@gpveo,@elrbu,@elauu
end
def generate
# locals
idime = $idime
jdime = $jdime
igaus = $igaus
inode = $inode
jnode = $jnode
fact = $fact
idof = $idof
jdof = $jdof
<<boast-nest4-body-expressions>>
<<boast-nest4-body-build-loops>>
<<boast-nest4-body-kfl-lumped>>
<<boast-nest4-body-unroll-kfl-lumped>>
<<boast-nest4-body-ifs>>
construct(main_block)
end
end
Same principle as in <<boast-nest4-body-unroll-kfl-lumped>>, we unroll
according to the values that can take ndime
or not. And we place the
block in if
statement.
for1 = []
@dimensions.each{|dim|
for1.push For(igaus,1,@pgaus){
pr For(inode,1,@pnode){
pr For(idime,1,dim){
pr idof[idime] === (inode-1)*dim + idime
}.unroll(@unroll)
pr For(jnode,1,@pnode){
pr fact[1] === @gpvol[igaus] * @gpsha[jnode,igaus]
pr For(jdime,1,dim){
pr fact[jdime+1] === fact[1] * @gpcar[jdime,inode,igaus]
pr @elapu[jnode,idof[jdime]] === @elapu[jnode,idof[jdime]] + fact[jdime+1]
}.unroll(@unroll)
pr For(jdime,1,dim){
pr @elaup[idof[jdime],jnode] === @elaup[idof[jdime],jnode] - fact[jdime+1]
}.unroll(@unroll)
}
}
}
}
main_block = lambda{
decl igaus,inode,jnode,idime,jdime,idof,fact unless @usage == :included
if @unroll then
pr If(@ndime == 2 => lambda{
pr for1[0]
}, else: lambda{
pr for1[1]
})
else
pr for1[0]
end
}
Same principle as in <<boast-nest1-class>>
class Nest5 < Subroutine
def initialize(options, functions = nil)
super("nest5",options,functions)
# ins
@ndime = Parameters.copy($ndime,:in)
@mnode = Parameters.copy($mnode,:in)
@pgaus = Parameters.copy($pgaus,:in)
@pnode = Parameters.copy($pnode,:in)
@gpvol = Parameters.copy($gpvol,:in)
@gpsha = Parameters.copy($gpsha,:in)
@gpcar = Parameters.copy($gpcar,:in)
# inouts
@elapu = Parameters.copy($elapu,:inout)
@elaup = Parameters.copy($elaup,:inout)
@args.push @ndime,@mnode,@pgaus,@pnode,@gpvol,@gpsha,@gpcar,@elapu,@elaup
end
def generate
# locals
igaus = $igaus
inode = $inode
jnode = $jnode
idime = $idime
jdime = $jdime
idof = $idof
fact = $fact
<<boast-nest5-body>>
construct(main_block)
end
end
Same principle as in <<boast-nest1-body>>
main_block = lambda{
decl igaus,inode,jnode,fact unless @usage == :included
pr If(@kfl_stabi_nsi != -1){
pr For(igaus,1,@pgaus){
pr For(inode,1,@pnode){
pr For(jnode,inode+1,@pnode){
pr fact[2] === @gpsp1_p[igaus] * @wgrgr[jnode,inode,igaus] * @gpvol[igaus]
pr @elapp[jnode,inode] === @elapp[jnode,inode] + fact[2]
pr @elapp[inode,jnode] === @elapp[inode,jnode] + fact[2]
}
pr fact[2] === @gpsp1_p[igaus] * @wgrgr[inode,inode,igaus] * @gpvol[igaus]
pr @elapp[inode,inode] === @elapp[inode,inode] + fact[2]
}
}
}
}
Same principle as in <<boast-nest1-class>>
class Nest6 < Subroutine
def initialize(options, functions = nil)
super("nest6",options,functions)
# ins
@kfl_stabi_nsi = Parameters.copy($kfl_stabi_nsi)
@pgaus = Parameters.copy($pgaus)
@pnode = Parameters.copy($pnode)
@gpsp1_p = Parameters.copy($gpsp1_p)
@wgrgr = Parameters.copy($wgrgr)
@gpvol = Parameters.copy($gpvol)
# inouts
@elapp = Parameters.copy($elapp)
@args.push @kfl_stabi_nsi,@pgaus,@pnode,@gpsp1_p,@wgrgr,@gpvol,@elapp
end
def generate
# locals
igaus = $igaus
inode = $inode
jnode = $jnode
fact = $fact
<<boast-nest6-body>>
construct(main_block)
end
end
Same principle as in <<boast-nest1-body>>
main_block = lambda {
decl fact,igaus,inode unless @usage == :included
pr For(igaus,1,@pgaus){
pr fact[2] === @penal_nsi * @gpvol[igaus]
pr For(inode,1,@pnode){
pr @elapp[inode,inode] === @elapp[inode,inode] + fact[2] * @gpsha[inode,igaus]
pr @elrbp[inode] === @elrbp[inode] + fact[2] * @gpsha[inode,igaus] * @elpre[inode,1]
}
}
}
Same principle as in <<boast-nest1-class>>
class Nest7 < Subroutine
def initialize(options, functions = nil)
super("nest7",options,functions)
# ins
@pnode = Parameters.copy($pnode)
@pgaus = Parameters.copy($pgaus)
@penal_nsi = Parameters.copy($penal_nsi)
@gpvol = Parameters.copy($gpvol)
@gpsha = Parameters.copy($gpsha)
@elpre = Parameters.copy($elpre)
# inouts
@elapp = Parameters.copy($elapp)
@elrbp = Parameters.copy($elrbp)
@args.push @pnode,@pgaus,@penal_nsi,@gpvol,@gpsha,@elpre,@elapp,@elrbp
end
def generate
# locals
fact = $fact
igaus = $igaus
inode = $inode
<<boast-nest7-body>>
construct(main_block)
end
end
First, this nest uses the function epsilon
for which a C macro
equivalent exists. For the FORTRAN version we can call directly
this function with BOAST by telling it this is an external
function with register_funccall
.
Second, the FORTRAN functions tanh
and min
can work with either
vector or scalar variables. Intrinsics exist for these
functions and BOAST provides support for them (Tanh
and Min
) by
calling the corresponding instruction whether working on scalar
or vector variables. However these Intrinsics only work with ICC
and we wrote scalar equivalent, they can be found later in this
tutorial. For more convenience the call to either our scalar
versions or the BOAST versions have been encapsulated into the
lambdas tanh
and min
.
register_funccall("epsilon") if get_lang == FORTRAN
tanh = lambda{|x|
if get_lang == FORTRAN or @vector_length < 2 then
return Tanh(x)
else
return @functions[:tanh].call(x)
end
}
min = lambda{|x,y|
if get_lang == FORTRAN then
return Min( x, y )
else
if @vector_length > 1 then
return @functions[:min].call(x,y)
else
return Ternary(x < y, x, y)
end
end
}
As in <<boast-initialization>> we used directly C code to initialize the arrays:
for_igaus = For(igaus,1,@pgaus){
if get_lang == FORTRAN then
pr c1[1] === 0.0.to_var
pr c2[1] === 0.0.to_var
pr c3[1] === 0.0.to_var
elsif get_lang == C
code =<<EOF
memset(c1,0,sizeof(c1));
memset(c2,0,sizeof(c2));
memset(c3,0,sizeof(c3));
EOF
get_output.print code
end
pr For(idime,1,@ndime){
if get_lang == FORTRAN then
pr c4[1] === 0.0.to_var
elsif get_lang == C
code =<<EOF
memset(c4,0,sizeof(c4));
EOF
get_output.print code
end
Like for tanh
and min
the Intrinsic for the exponential is only
available for the ICC so here we just replaced it with a
multiplication:
pr For(inode,1,@pnode){
pr c4[1] === c4[1] + @agrau[inode,igaus] * @elvel[idime,inode,1]
}
pr c4[1] === @gpsp1[igaus] * c4[1]
# Exponential intrinsics only works with ICC
# pr c1[1] === c1[1] + ( @gpvep[idime,igaus] - c4[1] )**2
pr c1[1] === c1[1] + ( @gpvep[idime,igaus] - c4[1] ) * ( @gpvep[idime,igaus] - c4[1] )
pr c3[1] === c3[1] + @gpvep[idime,igaus] * @gpvep[idime,igaus]
pr c2[1] === c2[1] + c4[1] * c4[1]
}
For square root no need to use scalar function when working on
vectors, unlike with tanh
and min
the corresponding intrinsics
are not ICC exclusive:
pr c3[1] === Sqrt( c2[1] ) + Sqrt( c3[1] )
pr c1[1] === Sqrt( c1[1] )
For the C version, we just need to use the C macro that gives the epsilon:
if get_lang == FORTRAN then
pr beta[1] === c1[1] / ( c3[1] + epsilon(1.0.to_var) )
elsif get_lang == C
code =<<EOF
beta[0] = c1[0] / ( c3[0] + DBL_EPSILON );
EOF
get_output.print code
end
For the call to min
we need to use Set
, for the 1.0 value, to
convert it as a vector if working on vectors. For tanh there is
no need because done implicitly by Tanh
:
pr If(@kfl_limit_nsi == 1 => lambda{
pr alpha[1] === min.call( Set(1.0.to_var, alpha), 2.0.to_var * ( 1.0.to_var - beta[1] ) )
}, @kfl_limit_nsi == 2 => lambda{
pr alpha[1] === 0.5.to_var * ( tanh.call( 20.0.to_var * ( beta[1] - 0.8.to_var ) ) + 1.0.to_var )
})
The following is as previously, we close the for loop opened at the beginning and we build the subroutine:
pr For(idime,1,@ndime){
pr @gpvep[idime,igaus] === alpha[1] * @gpvep[idime,igaus]
}
}
if get_lang == C then
code =<<EOF
memset(gpvep,0,sizeof(#{@gpvep.type.decl}) * ndime * pgaus);
EOF
end
main_block = lambda{
decl igaus,idime,inode,c1,c2,c3,c4,beta,alpha unless @usage == :included
pr If( @kfl_stabi_nsi == -1 => lambda{
if get_lang == FORTRAN then
pr @gpvep === 0.0
elsif get_lang == C
get_output.print code
end
}, @kfl_limit_nsi == -1 => lambda{
if get_lang == FORTRAN then
pr @gpvep === 0.0
elsif get_lang == C
get_output.print code
end
}, @kfl_limit_nsi > 0 => lambda{ pr for_igaus } )
}
Same principle as in <<boast-nest1-class>>
class Nest8 < Subroutine
def initialize(options, functions = nil)
super("nest8",options,functions)
# ins
@pgaus = Parameters.copy($pgaus)
@ndime = Parameters.copy($ndime)
@pnode = Parameters.copy($pnode)
@elvel = Parameters.copy($elvel)
@agrau = Parameters.copy($agrau)
@gpsp1 = Parameters.copy($gpsp1)
@kfl_limit_nsi = Parameters.copy($kfl_limit_nsi)
@kfl_stabi_nsi = Parameters.copy($kfl_stabi_nsi)
# inouts
@gpvep = Parameters.copy($gpvep)
@args.push @pgaus,@ndime,@pnode,@elvel,@agrau,@gpsp1,@kfl_limit_nsi,@kfl_stabi_nsi,@gpvep
end
def generate
# local
igaus = $igaus
idime = $idime
inode = $inode
c1 = $c1
c2 = $c2
c3 = $c3
c4 = $c4
beta = $beta
alpha = $alpha
<<boast-nest8-body-functions>>
<<boast-nest8-body-init>>
<<boast-nest8-body-pow>>
<<boast-nest8-body-sqrt>>
<<boast-nest8-body-epsilon>>
<<boast-nest8-body-c-calls>>
<<boast-nest8-body-end>>
construct(main_block)
end
end
for_igaus = []
for_igaus[0] = []
for_igaus[1] = []
@dimensions.each{|dim|
for_igaus[0].push For(igaus,1,@pgaus){
pr For(idime,1,dim){
pr @gpgrp[idime,igaus] === @gpgrp[idime,igaus] + @gpsp1_p[igaus] * @gprhs[idime,igaus]
}.unroll(@unroll)
}
for_igaus[1].push For(igaus,1,@pgaus){
pr fact[2] === @gpden[igaus] * @dtsgs[1] * @gpsp1_v[igaus]
pr fact1_p[1] === @gpden[igaus] * @dtsgs[1] * @gpsp1_p[igaus]
pr For(idime,1,dim){
pr @gpvep[idime,igaus] === @gpvep[idime,igaus] + fact[2] * @gpsgs[idime,igaus,2]
pr @gpgrp[idime,igaus] === @gpgrp[idime,igaus] + fact1_p[1] * @gpsgs[idime,igaus,2]
}.unroll(@unroll)
}
}
if_ndime = []
2.times{|i|
if @unroll then
if_ndime[i] = If(@ndime == 2 => lambda{
pr for_igaus[i][0]
}, :else => lambda{
pr for_igaus[i][1]
})
else
if_ndime[i] = for_igaus[i][0]
end
}
block = lambda{
pr if_ndime[0]
pr If(@kfl_sgsti_nsi == 1){
pr if_ndime[1]
}
}
main_block = lambda{
decl igaus,idime,fact,fact1_p unless @usage == :included
pr If(@kfl_stabi_nsi == -1 => lambda{
if get_lang == FORTRAN then
pr @gpgrp === 0.0
elsif get_lang == C
code =<<EOF
memset(gpgrp,0,sizeof(#{@gpgrp.type.decl}) * ndime * pgaus);
EOF
end
get_output.print code
}, :else => block)
}
class Nest9 < Subroutine
def initialize(options, functions = nil)
super("nest9",options,functions)
# ins
@ndime = Parameters.copy($ndime)
@pgaus = Parameters.copy($pgaus)
@gpsp1_p = Parameters.copy($gpsp1_p)
@gprhs = Parameters.copy($gprhs)
@gpden = Parameters.copy($gpden)
@dtsgs = Parameters.copy($dtsgs)
@gpsp1_v = Parameters.copy($gpsp1_v)
@gpsgs = Parameters.copy($gpsgs)
@kfl_sgsti_nsi = Parameters.copy($kfl_sgsti_nsi)
@kfl_stabi_nsi = Parameters.copy($kfl_stabi_nsi)
# inouts
@gpgrp = Parameters.copy($gpgrp)
@gpvep = Parameters.copy($gpvep)
@args.push @ndime,@pgaus,@gpsp1_p,@gprhs,@gpden,@dtsgs,@gpsp1_v,@gpsgs,@kfl_sgsti_nsi,@kfl_stabi_nsi,@gpgrp,@gpvep
end
def generate
# local
igaus = $igaus
idime = $idime
fact = $fact
fact1_p = $fact1_p
<<boast-nest9-body>>
construct(main_block)
end
end
class Nest10 < Subroutine
def initialize(options, functions = nil)
super("nest10",options,functions)
# ins
@mnode = Parameters.copy($mnode)
@ndime = Parameters.copy($ndime)
@pgaus = Parameters.copy($pgaus)
@gpden = Parameters.copy($gpden)
@dtinv_mod = Parameters.copy($dtinv_mod)
@nbdfp_nsi = Parameters.copy($nbdfp_nsi)
@gpvel = Parameters.copy($gpvel)
@pnode = Parameters.copy($pnode)
@gpvol = Parameters.copy($gpvol)
@gpsha = Parameters.copy($gpsha)
@agrau = Parameters.copy($agrau)
@gprhc = Parameters.copy($gprhc)
@gpcar = Parameters.copy($gpcar)
@gpvep = Parameters.copy($gpvep)
@gpgrp = Parameters.copy($gpgrp)
# inouts
@gprhs = Parameters.copy($gprhs)
@elrbu = Parameters.copy($elrbu)
@elrbp = Parameters.copy($elrbp)
@args.push @mnode,@ndime,@pgaus,@gpden,@dtinv_mod,@nbdfp_nsi,@gpvel,@pnode,@gpvol,@gpsha,@agrau,@gprhc,@gpcar,@gpvep,@gpgrp,@gprhs,@elrbu,@elrbp
end
def generate
# locals
igaus = $igaus
fact = $fact
idime = $idime
itime = $itime
inode = $inode
for_igaus = []
@dimensions.each{|dim|
for_igaus.push For(igaus,1,@pgaus){
pr fact[5] === @gpden[igaus] * @dtinv_mod[1]
pr For(itime,2,@nbdfp_nsi){
pr For(idime,1,dim){
pr @gprhs[idime,igaus] === @gprhs[idime,igaus] - @functions[:pabdf_nsi].call(itime) * fact[5] * @gpvel[idime,igaus,itime]
}.unroll(@unroll)
}
pr For(inode,1,@pnode){
pr fact[2] === @gpvol[igaus] * @gpsha[inode,igaus] # ( f + rho*u^n/dt , v )
pr fact[4] === @gpvol[igaus] * @agrau[inode,igaus] # ( rho * a.grad(v) , P1' )
pr For(idime,1,dim){
pr @elrbu[idime,inode] === @elrbu[idime,inode] + fact[2] * @gprhs[idime,igaus] + fact[4] * @gpvep[idime,igaus]
}.unroll(@unroll)
pr @elrbp[inode] === @elrbp[inode] + @gpvol[igaus] * @gpsha[inode,igaus] * @gprhc[igaus] # ( rhs, q )
pr For(idime,1,dim){
pr @elrbp[inode] === @elrbp[inode] + @gpvol[igaus] * @gpcar[idime,inode,igaus] * @gpgrp[idime,igaus] # ( P2' , grad(q) )
}.unroll(@unroll)
}
}
}
main_block = lambda{
decl igaus,fact,idime,itime,inode unless @usage == :included
if @unroll then
pr If(@ndime == 2 => lambda{ pr for_igaus[0] }, :else => lambda{ pr for_igaus[1] })
else
pr for_igaus[0]
end
}
construct(main_block)
end
end
class Nest11 < Subroutine
def initialize(options, functions = nil)
super("nest11",options,functions)
# ins
@mnode = Parameters.copy($mnode)
@pgaus = Parameters.copy($pgaus)
@pnode = Parameters.copy($pnode)
@ndime = Parameters.copy($ndime)
@gpcar = Parameters.copy($gpcar)
@gpvol = Parameters.copy($gpvol)
@gpsha = Parameters.copy($gpsha)
@gpcar_bub = Parameters.copy($gpcar_bub)
@pbubl = Parameters.copy($pbubl)
@kfl_stabi_nsi = Parameters.copy($kfl_stabi_nsi)
@kfl_press_nsi = Parameters.copy($kfl_press_nsi)
@penal_nsi = Parameters.copy($penal_nsi)
@elbub = Parameters.copy($elbub)
@gprhc = Parameters.copy($gprhc)
@gpsha_bub = Parameters.copy($gpsha_bub)
# inouts
@elauq = Parameters.copy($elauq)
@elaqu = Parameters.copy($elaqu)
@elapq = Parameters.copy($elapq)
@elaqp = Parameters.copy($elaqp)
@elaqq = Parameters.copy($elaqq)
@elrbq = Parameters.copy($elrbq)
@args.push @mnode,@pgaus,@pnode,@ndime,@gpcar,@gpvol,@gpsha,@gpcar_bub,@pbubl,@kfl_stabi_nsi,@kfl_press_nsi,@penal_nsi,@elbub,@gprhc,@gpsha_bub,@elauq,@elaqu,@elapq,@elaqp,@elaqq,@elrbq
end
def generate
# locals
fact = $fact
igaus = $igaus
idof = $idof
idime = $idime
inode = $inode
register_funccall("stop") if get_lang == FORTRAN
register_funccall("maxval") if get_lang == FORTRAN
exp_elauq = []
exp_elauq[0] = "@elauq[idof[dim],1] === @elauq[idof[dim],1] - fact[2] * @gpcar[idime,inode,igaus]"
exp_elauq[1] = "@elauq[idof[dim],1] === @elauq[idof[dim],1] + @gpvol[igaus] * @gpsha[inode,igaus] * @gpcar_bub[idime,igaus]"
block_igaus = lambda{|dim,exp|
return For(igaus,1,@pgaus){
pr fact[2] === @gpvol[igaus] * @gpsha_bub[igaus]
pr For(inode,1,@pnode){
pr For(idime,1,dim){
pr idof[dim] === (inode-1)*dim + idime
pr eval exp
pr @elaqu[1,idof[dim]] === @elaqu[1,idof[dim]] + fact[2] * @gpcar[idime,inode,igaus]
}.unroll(@unroll)
}
}
}
if @unroll then
if_kfl_press_nsi_1 = If(@ndime == 2 => lambda{ pr block_igaus.call(@dimensions[0],exp_elauq[0]) }, :else => lambda{ pr block_igaus.call(@dimensions[1],exp_elauq[0]) })
if_kfl_press_nsi_else = If(@ndime == 2 => lambda{ pr block_igaus.call(@dimensions[0],exp_elauq[1]) }, :else => lambda{ pr block_igaus.call(@dimensions[1],exp_elauq[1]) })
else
if_kfl_press_nsi_1 = block_igaus.call(@dimensions[0],exp_elauq[0])
if_kfl_press_nsi_else = block_igaus.call(@dimensions[0],exp_elauq[1])
end
call_maxval = lambda{|x|
if get_lang == FORTRAN
return maxval(x)
else
return @functions[:maxval].call(x)
end
}
main_block = lambda{
decl fact,igaus,idof,idime,inode unless @usage == :included
pr If(call_maxval.call(@pbubl) == 1){
pr If(@kfl_stabi_nsi != -1){
if get_lang == FORTRAN then
stop
end
}
if get_lang == FORTRAN then
pr @elauq === 0.0
pr @elapq === 0.0
pr @elaqu === 0.0
pr @elaqp === 0.0
pr @elaqq === 0.0
pr @elrbq === 0.0
elsif get_lang == C
code =<<EOF
memset(elauq, 0, sizeof(#{@elauq.type.decl}) * pnode * ndime);
memset(elapq, 0, sizeof(#{@elapq.type.decl}) * pnode);
memset(elaqu, 0, sizeof(#{@elaqu.type.decl}) * pnode * ndime);
memset(elaqp, 0, sizeof(#{@elaqp.type.decl}) * pnode);
memset(elaqq, 0, sizeof(#{@elaqq.type.decl}));
memset(elrbq, 0, sizeof(#{@elrbq.type.decl}));
EOF
get_output.print code
end
pr If(@kfl_press_nsi == 1 => lambda{ pr if_kfl_press_nsi_1}, :else => lambda{ pr if_kfl_press_nsi_else })
pr For(igaus,1,@pgaus){
pr @elaqq[1,1] === @elaqq[1,1] + @gpvol[igaus] * @gpsha_bub[igaus] * @penal_nsi
pr @elrbq[1] === @elrbq[1] + @gpvol[igaus] * @gpsha_bub[igaus] * @penal_nsi * @elbub[1]
pr @elrbq[1] === @elrbq[1] + @gpvol[igaus] * @gpsha_bub[igaus] * @gprhc[igaus]
}
}
}
construct(main_block)
end
end
x1 = Int("x", :dir => :in)
y1 = Real("y")
@pabdf_nsi = Procedure("pabdf_nsi",[x1], :return => y1){
pr y1 === 1.0
}
Because in C there are no equivalent of the following functions, or intrinsics we need to write them. <<boast-c-functions>>
unless get_lang == FORTRAN then
x2 = Int("x", :dir => :in, :vector_length => @opts[:vector_length], :dim => [Dim(1)])
y2 = Int("y")
@p_maxval = Procedure("maxval",[x2], :return => y2){
if @opts[:vector_length] > 1 then
decl i = Int("i")
decl a = Int("a", :dim => [Dim(@opts[:vector_length])], :allocate => true)
pr a[1] === x2.dereference
pr y2 === a[1]
pr For(i,2,@opts[:vector_length]){
pr If(a[i] > y2){
pr y2 === a[i]
}
}
else
pr y2 === x2[1]
end
}
end
unless get_lang == FORTRAN then
x3 = Real("x", :dir => :in, :vector_length => @opts[:vector_length])
y3 = Real("y", :vector_length => @opts[:vector_length])
@p_tanh = Procedure("Tanh", [x3], :return => y3){
decl i = Int("i")
decl tmp1 = Real("tmp1", :dim => [Dim(@opts[:vector_length])], :allocate => true)
decl tmp2 = Real("tmp2", :dim => [Dim(@opts[:vector_length])], :allocate => true)
pr tmp1[1] === x3
pr For(i,1,@opts[:vector_length]){
pr tmp2[i] === Tanh(tmp1[i])
}
pr y3 === tmp2[1]
}
end
unless get_lang == FORTRAN then
x4 = Real("x", :dir => :in, :vector_length => @opts[:vector_length])
y4 = Real("y", :vector_length => @opts[:vector_length])
@p_sqrt = Procedure("Sqrt", [x4], :return => y4){
decl i = Int("i")
decl tmp1 = Real("tmp1", :dim => [Dim(@opts[:vector_length])], :allocate => true)
decl tmp2 = Real("tmp2", :dim => [Dim(@opts[:vector_length])], :allocate => true)
pr tmp1[1] === x4
pr For(i,1,@opts[:vector_length]){
pr tmp2[i] === Sqrt(tmp1[i])
}
pr y4 === tmp2[1]
}
end
unless get_lang == FORTRAN then
x5 = Real("x", :dir => :in, :vector_length => @opts[:vector_length])
y5 = Real("y", :dir => :in, :vector_length => @opts[:vector_length])
z5 = Real("z", :vector_length => @opts[:vector_length])
@p_min = Procedure("Min", [x5,y5], :return => z5){
decl i = Int("i")
decl tmp1 = Real("tmp1", :dim => [Dim(@opts[:vector_length])], :allocate => true)
decl tmp2 = Real("tmp2", :dim => [Dim(@opts[:vector_length])], :allocate => true)
decl tmp3 = Real("tmp3", :dim => [Dim(@opts[:vector_length])], :allocate => true)
pr tmp1[1] === x5
pr tmp2[1] === y5
pr For(i,1,@opts[:vector_length]){
pr tmp3[i] === Ternary(tmp1[i] < tmp2[i], tmp1[i], tmp2[i])
}
pr z5 === tmp3[1]
}
end
The subroutines are created by simply creating instances and generating the procedure. When a user defined functions is used by a subroutine it need to be referenced by the Procedure like it is the case for the nest 2,8,10 and 11:
nests = {}
@opts[:nests].each{|i|
case i
when 2,10
functions = {:pabdf_nsi => @pabdf_nsi}
when 8
functions = get_lang == FORTRAN ? nil :{:tanh => @p_tanh, :min => @p_min}
when 11
functions = get_lang == FORTRAN ? nil : {:maxval => @p_maxval}
else
functions = nil
end
eval "nests[:nest#{i}] = Nest#{i}::new(@opts,functions)"
eval "nests[:nest#{i}].generate"
}
However the subroutines have been previously generated the code has
not been printed yet into the kernel, this is done using the
method call
of the subroutines.
class KSplitBoast < KSplitOss
def generate
if @opts[:unroll] then
@dimensions = [2,3]
else
@dimensions = [@ndime]
end
register_funccall("maxval") if get_lang == FORTRAN
register_funccall("epsilon") if get_lang == FORTRAN
register_funccall("stop") if get_lang == FORTRAN
<<boast-mocks>>
<<boast-generate-maxval>>
<<boast-generate-tanh>>
<<boast-generate-sqrt>>
<<boast-generate-min>>
<<boast-subroutines-instanciate>>
includes = ["immintrin.h"]
includes.push("string.h", "math.h", "float.h") if get_lang == C
@kernel = CKernel::new( :includes => includes )
functions = [@pabdf_nsi]
functions.push @p_maxval unless get_lang == FORTRAN
@kernel.procedure = declare_procedure(functions.flatten)
# Printing subroutines
pr @pabdf_nsi
unless get_lang == FORTRAN then
pr @p_maxval
pr @p_min
pr @p_tanh
end
nests.values.each{|n| pr n.code } unless @opts[:inline] == :included
# Main procedure body
opn @kernel.procedure
# Local arrays
<<boast-local-vars>>
<<boast-initialization>>
# Either call subroutine or paste their code
nests.values.each{|n| n.call}
close @kernel.procedure
end
end
This document has local variables in its postembule, which should allow Org-mode to work seamlessly without any setup. If you’re uncomfortable using such variables, you can safely ignore them at startup. Exporting may require that you copy them in your .emacs.