From 2030f952e827a4ae8fb3d17bb68c4033f23c6f36 Mon Sep 17 00:00:00 2001 From: Patrick Atkinson Date: Sun, 6 Sep 2020 16:04:29 +0100 Subject: [PATCH 1/3] add initial GPU port of xtb --- meson.build | 4 + meson/meson.build | 20 +- meson_options.txt | 10 + src/disp/dftd4.f90 | 143 ++++++- src/intgrad.f90 | 3 + src/lin_mod.f90 | 1 + src/mctc/lapack/eigensolve.f90 | 45 +++ src/mctc/mctc_global.f90 | 6 + src/mctc/mctc_init.f90 | 9 + src/nvtx.f90 | 81 ++++ src/scc_core.f90 | 2 + src/scf_module.f90 | 8 + src/xtb/hamiltonian_gpu.f90 | 710 +++++++++++++++++++++++++++++++++ src/xtb/meson.build | 1 + 14 files changed, 1041 insertions(+), 2 deletions(-) create mode 100644 src/nvtx.f90 create mode 100644 src/xtb/hamiltonian_gpu.f90 diff --git a/meson.build b/meson.build index 77c27208f..52402e1fd 100644 --- a/meson.build +++ b/meson.build @@ -86,6 +86,10 @@ subdir('src') srcs += 'symmetry/symmetry.f90' srcs += 'symmetry/symmetry_i.c' +if get_option('nvtx') + srcs += 'src/nvtx.f90' +endif + xtb_inc = meson.current_source_dir() / 'include' incdir = include_directories('include') diff --git a/meson/meson.build b/meson/meson.build index 749c0e99b..47b533c88 100644 --- a/meson/meson.build +++ b/meson/meson.build @@ -34,9 +34,23 @@ elif fc.get_id() == 'intel' add_project_arguments('-traceback', language: 'fortran') elif fc.get_id() == 'pgi' add_project_arguments( - '-Mbackslash', '-Mallocatable=03', '-traceback', '-r8', + '-Mpreprocess', '-Mbackslash', '-Mallocatable=03', '-traceback', '-r8', language: 'fortran' ) + + if get_option('gpu') + add_project_arguments('-acc', '-Minfo=accel', '-DXTB_GPU', language: 'fortran') + add_project_link_arguments('-acc', '-Minfo=accel', language: 'fortran') + + gpu_arch = get_option('gpu_arch') + add_project_arguments('-ta=tesla:cc@0@'.format(gpu_arch), language: 'fortran') + add_project_link_arguments('-ta=tesla:cc@0@'.format(gpu_arch), language: 'fortran') + + if get_option('cusolver') + add_project_arguments('-Mcudalib=cusolver', '-DUSE_CUSOLVER', language: 'fortran') + add_project_link_arguments('-Mcudalib=cusolver', '-DUSE_CUSOLVER', language: 'fortran') + endif + endif endif # fix compiliation problems with of symmetry/symmetry_i.c @@ -135,6 +149,10 @@ endif dependencies += dependency('threads') +if get_option('nvtx') + dependencies += fc.find_library('nvToolsExt', required: true) +endif + # distribute dependencies for shared object and static executable lib_deps += dependencies exe_deps += dependencies diff --git a/meson_options.txt b/meson_options.txt index 8e38f6022..21692bf8a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -28,3 +28,13 @@ option('build_name', type: 'string', value: 'unknown', description: 'Name of the build, will be overwritten automatically by git') option('test_timeout', type: 'integer', min: 1, value: 30, description: 'test timeout in seconds') + +# GPU specific options +option('gpu', type: 'boolean', value: false, + description: 'use GPU acceleration') +option('gpu_arch', type: 'string', value: '70', + description: 'GPU architecture version string') +option('cusolver', type: 'boolean', value: false, + description: 'Use cuSOLVER for eigensolver routines') +option('nvtx', type: 'boolean', value: false, + description: 'use NVTX markers') diff --git a/src/disp/dftd4.f90 b/src/disp/dftd4.f90 index cf1340172..fc04c013a 100644 --- a/src/disp/dftd4.f90 +++ b/src/disp/dftd4.f90 @@ -1,6 +1,7 @@ ! This file is part of xtb. ! ! Copyright (C) 2017-2020 Stefan Grimme +! Copyright (C) 2020, NVIDIA CORPORATION. All rights reserved. ! ! xtb is free software: you can redistribute it and/or modify it under ! the terms of the GNU Lesser General Public License as published by @@ -1609,8 +1610,13 @@ subroutine d4_full_gradient_latp & if (par%s9 /= 0.0_wp) then call get_atomic_c6(dispm, nat, mol%at, zerovec, zerodcn, zerodq, & & c6, dc6dcn, dc6dq) +#ifdef XTB_GPU + call atm_gradient_latp_gpu(mol, trans, cutoff3, par, sqrtZr4r2, c6, dc6dcn, & + & energies3, gradient, sigma, dEdcn) +#else call atm_gradient_latp(mol, trans, cutoff3, par, sqrtZr4r2, c6, dc6dcn, & & energies3, gradient, sigma, dEdcn) +#endif end if if (present(e3)) e3 = sum(energies3) @@ -1892,9 +1898,15 @@ subroutine d4_atm_gradient_latp & call get_atomic_c6(dispm, nat, mol%at, zerovec, zerodcn, zerodq, & & c6, dc6dcn, dc6dq) +#ifdef XTB_GPU + call atm_gradient_latp_gpu & + & (mol, trans, cutoff, par, sqrtZr4r2, c6, dc6dcn, & + & energies, gradient, sigma, dEdcn) +#else call atm_gradient_latp & & (mol, trans, cutoff, par, sqrtZr4r2, c6, dc6dcn, & & energies, gradient, sigma, dEdcn) +#endif call mctc_gemv(dcndr, dEdcn, gradient, beta=1.0_wp) call mctc_gemv(dcndL, dEdcn, sigma, beta=1.0_wp) @@ -1903,7 +1915,6 @@ subroutine d4_atm_gradient_latp & end subroutine d4_atm_gradient_latp - subroutine atm_gradient_latp & & (mol, trans, cutoff, par, r4r2, c6, dc6dcn, & & energies, gradient, sigma, dEdcn) @@ -1996,11 +2007,140 @@ subroutine atm_gradient_latp & end subroutine atm_gradient_latp +subroutine atm_gradient_latp_gpu & + & (mol, trans, cutoff, par, r4r2, c6, dc6dcn, & + & energies, gradient, sigma, dEdcn) + + !> Molecular structure data + type(TMolecule), intent(in) :: mol + + !> Damping parameters + type(dftd_parameter), intent(in) :: par + + real(wp), intent(in) :: trans(:, :) + real(wp), intent(in) :: r4r2(:) + real(wp), intent(in) :: cutoff + real(wp), intent(in) :: c6(:, :) + real(wp), intent(in) :: dc6dcn(:, :) + + real(wp), intent(inout) :: energies(:) + real(wp), intent(inout) :: gradient(:, :) + real(wp), intent(inout) :: sigma(:, :) + real(wp), intent(inout) :: dEdcn(:) + + integer :: iat, jat, kat, ati, atj, atk, jtr, ktr + real(wp) :: cutoff2 + real(wp) :: rij(3), rjk(3), rik(3), r2ij, r2jk, r2ik + real(wp) :: c6ij, c6jk, c6ik, cij, cjk, cik, scale + real(wp) :: dE, dG(3, 3), dS(3, 3), dCN(3) + real(wp), parameter :: sr = 4.0_wp/3.0_wp + integer :: mlen, k, kk + + cutoff2 = cutoff**2 + mlen = len(mol) + + !$acc enter data copyin(par,trans,r4r2,c6,dc6dcn,energies,gradient,sigma,dEdcn, & + !$acc& mol,mol%at,mol%xyz) + + !$acc parallel default(present) private(rij,rjk,rik,dG,dS,dCN) + + !$acc loop gang collapse(2) + do iat = 1, mlen + do jat = 1, mlen + if (jat.gt.iat) cycle + + do kat = 1, jat + ati = mol%at(iat) + atj = mol%at(jat) + + c6ij = c6(jat,iat) + cij = par%a1*sqrt(3.0_wp*r4r2(ati)*r4r2(atj))+par%a2 + + atk = mol%at(kat) + + c6ik = c6(kat,iat) + c6jk = c6(kat,jat) + + cik = par%a1*sqrt(3.0_wp*r4r2(ati)*r4r2(atk))+par%a2 + cjk = par%a1*sqrt(3.0_wp*r4r2(atj)*r4r2(atk))+par%a2 + + do jtr = 1, size(trans, dim=2) + rij = mol%xyz(:, jat) - mol%xyz(:, iat) + trans(:, jtr) + r2ij = sum(rij**2) + if (r2ij > cutoff2 .or. r2ij < 1.0e-14_wp) cycle + do ktr = 1, size(trans, dim=2) + if (jat == kat .and. jtr == ktr) cycle + rik = mol%xyz(:, kat) - mol%xyz(:, iat) + trans(:, ktr) + r2ik = sum(rik**2) + if (r2ik > cutoff2 .or. r2ik < 1.0e-14_wp) cycle + rjk = mol%xyz(:, kat) - mol%xyz(:, jat) + trans(:, ktr) & + & - trans(:, jtr) + r2jk = sum(rjk**2) + if (r2jk > cutoff2 .or. r2jk < 1.0e-14_wp) cycle + + call deriv_atm_triple(c6ij, c6ik, c6jk, cij, cjk, cik, & + & r2ij, r2jk, r2ik, dc6dcn(iat,jat), dc6dcn(jat,iat), & + & dc6dcn(jat,kat), dc6dcn(kat,jat), dc6dcn(iat,kat), & + & dc6dcn(kat,iat), rij, rjk, rik, par%alp, dE, dG, dS, dCN) + + scale = par%s9 * triple_scale(iat, jat, kat) + !$acc atomic + energies(iat) = energies(iat) + dE * scale/3 + !$acc end atomic + !$acc atomic + energies(jat) = energies(jat) + dE * scale/3 + !$acc end atomic + !$acc atomic + energies(kat) = energies(kat) + dE * scale/3 + !$acc end atomic + do k = 1,3 + !$acc atomic + gradient(k, iat) = gradient(k, iat) + dG(k, 1) * scale + !$acc end atomic + !$acc atomic + gradient(k, jat) = gradient(k, jat) + dG(k, 2) * scale + !$acc end atomic + !$acc atomic + gradient(k, kat) = gradient(k, kat) + dG(k, 3) * scale + !$acc end atomic + enddo + do k = 1,3 + do kk = 1,3 + !$acc atomic + sigma(k, kk) = sigma(k, kk) + dS(k, kk) * scale + !$acc end atomic + enddo + enddo + !$acc atomic + dEdcn(iat) = dEdcn(iat) + dCN(1) * scale + !$acc end atomic + !$acc atomic + dEdcn(jat) = dEdcn(jat) + dCN(2) * scale + !$acc end atomic + !$acc atomic + dEdcn(kat) = dEdcn(kat) + dCN(3) * scale + !$acc end atomic + + end do + end do + + end do + end do + end do + !$acc end parallel + + !$acc exit data copyout(energies,gradient,sigma,dEdcn) + !$acc exit data delete(par,trans,r4r2,c6,dc6dcn,mol,mol%at,mol%xyz) + + +end subroutine atm_gradient_latp_gpu + pure subroutine deriv_atm_triple(c6ij, c6ik, c6jk, cij, cjk, cik, & & r2ij, r2jk, r2ik, dc6ij, dc6ji, dc6jk, dc6kj, dc6ik, dc6ki, & & rij, rjk, rik, alp, dE, dG, dS, dCN) + !$acc routine vector real(wp), intent(in) :: c6ij, c6ik, c6jk real(wp), intent(in) :: cij, cjk, cik real(wp), intent(in) :: r2ij, r2jk, r2ik @@ -2069,6 +2209,7 @@ end subroutine deriv_atm_triple !> Logic exercise to distribute a triple energy to atomwise energies. elemental function triple_scale(ii, jj, kk) result(scale) + !$acc routine seq !> Atom indices integer, intent(in) :: ii, jj, kk diff --git a/src/intgrad.f90 b/src/intgrad.f90 index 2e9b54d3d..154950a3e 100644 --- a/src/intgrad.f90 +++ b/src/intgrad.f90 @@ -64,6 +64,7 @@ module xtb_intgrad ! --------------------------------------------------------------[SAW1907]- !> calculates a partial overlap in one cartesian direction pure elemental function olapp(l,gama) result(s) + !$acc routine seq implicit none integer,intent(in) :: l real(wp),intent(in) :: gama @@ -344,6 +345,7 @@ end subroutine build_hshift ! --------------------------------------------------------------[SAW1801]- pure subroutine build_hshift2(cfs,a,e,l) + !$acc routine seq implicit none integer,intent(in) :: l real(wp), intent(in) :: a,e @@ -374,6 +376,7 @@ end subroutine build_hshift2 ! --------------------------------------------------------------[SAW1801]- pure subroutine prod3(a,b,d,la,lb) + !$acc routine seq implicit none integer,intent(in) :: la,lb real(wp), intent(in) :: a(*),b(*) diff --git a/src/lin_mod.f90 b/src/lin_mod.f90 index 18a989bdb..cdc7617f3 100644 --- a/src/lin_mod.f90 +++ b/src/lin_mod.f90 @@ -27,6 +27,7 @@ module xtb_lin !*********************************************************************** pure elemental integer function lin(i1,i2) + !$acc routine seq integer,intent(in) :: i1,i2 integer :: idum1,idum2 idum1=max(i1,i2) diff --git a/src/mctc/lapack/eigensolve.f90 b/src/mctc/lapack/eigensolve.f90 index ad3020cd7..f42745b5c 100644 --- a/src/mctc/lapack/eigensolve.f90 +++ b/src/mctc/lapack/eigensolve.f90 @@ -1,6 +1,7 @@ ! This file is part of xtb. ! ! Copyright (C) 2019-2020 Sebastian Ehlert +! Copyright (C) 2020, NVIDIA CORPORATION. All rights reserved. ! ! xtb is free software: you can redistribute it and/or modify it under ! the terms of the GNU Lesser General Public License as published by @@ -21,6 +22,10 @@ module xtb_mctc_lapack_eigensolve use xtb_mctc_lapack_geneigval, only : lapack_sygvd use xtb_mctc_lapack_trf, only : mctc_potrf use xtb_type_environment, only : TEnvironment +#ifdef USE_CUSOLVER + use xtb_mctc_global + use cusolverDn +#endif implicit none private @@ -35,6 +40,9 @@ module xtb_mctc_lapack_eigensolve real(sp), allocatable :: sbmat(:, :) real(dp), allocatable :: dwork(:) real(dp), allocatable :: dbmat(:, :) +#ifdef USE_CUSOLVER + integer :: lwork +#endif contains generic :: solve => sgen_solve, dgen_solve procedure :: sgen_solve => mctc_ssygvd @@ -74,11 +82,29 @@ subroutine initDEigenSolver(self, env, bmat) class(TEigenSolver), intent(out) :: self type(TEnvironment), intent(inout) :: env real(dp), intent(in) :: bmat(:, :) +#ifdef USE_CUSOLVER + integer :: istat, lwork + ! dummy is only a dummy argument used to query the workspace size needed + ! for cuSolverDnDsygvd -- it is okay to pass an empty array to cuSolverDnDsygvd_bufferSize + real(dp) :: dummy(:) +#endif self%n = size(bmat, 1) +#ifdef USE_CUSOLVER + istat = cusolverDnDsygvd_bufferSize(cusolverDnH, CUSOLVER_EIG_TYPE_1, & + CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, self%n, dummy, & + self%n, dummy, self%n, dummy, lwork) + if (istat /= 0) then + call env%error("failed to get dygvd buffer size", source) + end if + + self%lwork = lwork + allocate(self%dwork(lwork)) +#else allocate(self%dwork(1 + 6*self%n + 2*self%n**2)) allocate(self%iwork(3 + 5*self%n)) +#endif self%dbmat = bmat ! Check for Cholesky factorisation @@ -118,13 +144,32 @@ subroutine mctc_dsygvd(self, env, amat, bmat, eval) real(dp), intent(in) :: bmat(:, :) real(dp), intent(out) :: eval(:) integer :: info, ldwork, liwork +#ifdef USE_CUSOLVER + integer :: istat +#endif self%dbmat(:, :) = bmat +#ifdef USE_CUSOLVER + !$acc enter data copyin(amat, self%dbmat, eval, info) create(self%dwork) + + !$acc host_data use_device(amat, self%dbmat, eval, self%dwork, info) + istat = cusolverDnDsygvd(cusolverDnH, CUSOLVER_EIG_TYPE_1, & + CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, self%n, amat, self%n, & + self%dbmat, self%n, eval, self%dwork, self%lwork, info) + !$acc end host_data + + !$acc exit data copyout(amat, self%dbmat, eval, info) delete(self%dwork) + + if (istat /= 0) then + call env%error("cuSovlerDnDsygvd failed", source) + end if +#else ldwork = size(self%dwork) liwork = size(self%iwork) call lapack_sygvd(1, 'v', 'u', self%n, amat, self%n, self%dbmat, self%n, eval, & & self%dwork, ldwork, self%iwork, liwork, info) +#endif if (info /= 0) then call env%error("Failed to solve eigenvalue problem", source) diff --git a/src/mctc/mctc_global.f90 b/src/mctc/mctc_global.f90 index 313b1469c..d932f3de3 100644 --- a/src/mctc/mctc_global.f90 +++ b/src/mctc/mctc_global.f90 @@ -17,6 +17,9 @@ module xtb_mctc_global use xtb_type_environment, only : TEnvironment +#ifdef USE_CUSOLVER + use cusolverDn +#endif character(len=:),allocatable :: name !< name of the currently running program character(len=:),allocatable,target :: msgbuffer !< error message buffer integer :: msgid !< number of generated errors @@ -31,6 +34,9 @@ module xtb_mctc_global !> Global environment type(TEnvironment), allocatable :: persistentEnv +#ifdef USE_CUSOLVER + type(cuSolverDnHandle) :: cusolverDnH +#endif type :: errormsg character(len=:),allocatable :: msg diff --git a/src/mctc/mctc_init.f90 b/src/mctc/mctc_init.f90 index 17a185880..5b41d8da9 100644 --- a/src/mctc/mctc_init.f90 +++ b/src/mctc/mctc_init.f90 @@ -24,6 +24,9 @@ subroutine mctc_init(progname,ntimer,verbose) character(len=*),intent(in) :: progname integer, intent(in) :: ntimer logical, intent(in) :: verbose +#ifdef USE_CUSOLVER + integer :: err +#endif !! ======================================================================== ! signal processing @@ -56,6 +59,12 @@ subroutine mctc_init(progname,ntimer,verbose) allocate(persistentEnv) call init(persistentEnv) +#ifdef USE_CUSOLVER + err = cusolverDnCreate(cusolverDnH) + if (err /= 0) then + call persistentEnv%error("failed to create cusolver handle", "mctc_init") + end if +#endif end subroutine mctc_init subroutine mctc_sanity(sane) diff --git a/src/nvtx.f90 b/src/nvtx.f90 new file mode 100644 index 000000000..0ea1f407d --- /dev/null +++ b/src/nvtx.f90 @@ -0,0 +1,81 @@ +! This file is part of xtb. +! +! Copyright (C) 2020, NVIDIA CORPORATION. All rights reserved. +! +! xtb is free software: you can redistribute it and/or modify it under +! the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. +! +! xtb is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with xtb. If not, see . + +module nvtx + + use iso_c_binding + implicit none + + integer,private :: col(7) = [ Z'0000ff00', Z'000000ff', Z'00ffff00', Z'00ff00ff', Z'0000ffff', Z'00ff0000', Z'00ffffff'] + character(len=256),private :: tempName + + type, bind(C):: nvtxEventAttributes + integer(C_INT16_T):: version=1 + integer(C_INT16_T):: size=48 ! + integer(C_INT):: category=0 + integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1 + integer(C_INT):: color + integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0 + integer(C_INT):: reserved0 + integer(C_INT64_T):: payload ! union uint,int,double + integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1 + type(C_PTR):: message ! ascii char + end type + + interface nvtxRangePush + ! push range with custom label and standard color + subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') + use iso_c_binding + character(kind=C_CHAR,len=*) :: name + end subroutine + + ! push range with custom label and custom color + subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') + use iso_c_binding + import:: nvtxEventAttributes + type(nvtxEventAttributes):: event + end subroutine + end interface + + interface nvtxRangePop + subroutine nvtxRangePop() bind(C, name='nvtxRangePop') + end subroutine + end interface + +contains + + subroutine nvtxStartRange(name,id) + character(kind=c_char,len=*) :: name + integer, optional:: id + type(nvtxEventAttributes):: event + + tempName=trim(name)//c_null_char + + if ( .not. present(id)) then + call nvtxRangePush(tempName) + else + event%color=col(mod(id,7)+1) + event%message=c_loc(tempName) + call nvtxRangePushEx(event) + end if + end subroutine + + subroutine nvtxEndRange + call nvtxRangePop + end subroutine + +end module nvtx diff --git a/src/scc_core.f90 b/src/scc_core.f90 index 55c35cb13..462f069ee 100644 --- a/src/scc_core.f90 +++ b/src/scc_core.f90 @@ -611,6 +611,7 @@ end subroutine scc !> H0 off-diag scaling subroutine h0scal(hData,il,jl,izp,jzp,valaoi,valaoj,km) + !$acc routine seq type(THamiltonianData), intent(in) :: hData integer, intent(in) :: il integer, intent(in) :: jl @@ -695,6 +696,7 @@ end subroutine electro !! ======================================================================== pure function shellPoly(iPoly,jPoly,iRad,jRad,xyz1,xyz2) use xtb_mctc_convert, only : aatoau + !$acc routine seq real(wp), intent(in) :: iPoly,jPoly real(wp), intent(in) :: iRad,jRad real(wp), intent(in) :: xyz1(3),xyz2(3) diff --git a/src/scf_module.f90 b/src/scf_module.f90 index 68e651974..4f8d15b2a 100644 --- a/src/scf_module.f90 +++ b/src/scf_module.f90 @@ -44,6 +44,7 @@ module xtb_scf use xtb_xtb_dispersion use xtb_xtb_hamiltonian, only : getSelfEnergy, build_SDQH0, build_dSDQH0, & & build_dSdQH0_noreset, count_dpint, count_qpint + use xtb_xtb_hamiltonian_gpu, only: build_SDQH0_gpu use xtb_xtb_multipole use xtb_paramset, only : tmmetal use xtb_scc_core @@ -532,10 +533,17 @@ subroutine scf(env, mol, wfn, basis, pcem, xtbData, solvation, & & selfEnergy=selfEnergy, dSEdcn=dSEdcn) ! compute integrals and prescreen to set up list arrays call latp%getLatticePoints(trans, sqrt(800.0_wp)) +#ifdef XTB_GPU + call build_SDQH0_gpu(xtbData%nShell, xtbData%hamiltonian, mol%n, mol%at, & + & basis%nbf, basis%nao, mol%xyz, trans, selfEnergy, intcut, & + & basis%caoshell, basis%saoshell, basis%nprim, basis%primcount, basis%alp, & + & basis%cont, S, dpint, qpint, H0) +#else call build_SDQH0(xtbData%nShell, xtbData%hamiltonian, mol%n, mol%at, & & basis%nbf, basis%nao, mol%xyz, trans, selfEnergy, intcut, & & basis%caoshell, basis%saoshell, basis%nprim, basis%primcount, basis%alp, & & basis%cont, S, dpint, qpint, H0) +#endif call count_dpint(ndp, dpint, neglect) call count_qpint(nqp, qpint, neglect) diff --git a/src/xtb/hamiltonian_gpu.f90 b/src/xtb/hamiltonian_gpu.f90 new file mode 100644 index 000000000..d62bb801b --- /dev/null +++ b/src/xtb/hamiltonian_gpu.f90 @@ -0,0 +1,710 @@ +! This file is part of xtb. +! +! Copyright (C) 2020, NVIDIA CORPORATION. All rights reserved. +! +! xtb is free software: you can redistribute it and/or modify it under +! the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. +! +! xtb is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with xtb. If not, see . + +module xtb_xtb_hamiltonian_gpu + use xtb_mctc_accuracy, only : wp + use xtb_mctc_constants, only : pi + use xtb_mctc_convert, only : evtoau + use xtb_xtb_data, only : THamiltonianData + use xtb_intgrad + use xtb_lin + use xtb_scc_core, only : shellPoly, h0scal + use xtb_grad_core, only : dshellPoly + implicit none + private + + public :: build_SDQH0_gpu + + integer,private, parameter :: llao (0:3) = (/ 1, 3, 6,10/) + integer,private, parameter :: llao2(0:3) = (/ 1, 3, 5, 7/) + + integer,parameter :: lx(84) = (/ & + & 0, & + & 1,0,0, & + & 2,0,0,1,1,0, & + & 3,0,0,2,2,1,0,1,0,1, & + & 4,0,0,3,3,1,0,1,0,2,2,0,2,1,1, & + & 5,0,0,3,3,2,2,0,0,4,4,1,0,0,1,1,3,1,2,2,1, & + & 6,0,0,3,3,0,5,5,1,0,0,1,4,4,2,0,2,0,3,3,1,2,2,1,4,1,1,2/) + integer,parameter :: ly(84) = (/ & + & 0, & + & 0,1,0, & + & 0,2,0,1,0,1, & + & 0,3,0,1,0,2,2,0,1,1, & + & 0,4,0,1,0,3,3,0,1,2,0,2,1,2,1, & + & 0,5,0,2,0,3,0,3,2,1,0,4,4,1,0,1,1,3,2,1,2, & + & 0,6,0,3,0,3,1,0,0,1,5,5,2,0,0,2,4,4,2,1,3,1,3,2,1,4,1,2/) + integer,parameter :: lz(84) = (/ & + & 0, & + & 0,0,1, & + & 0,0,2,0,1,1, & + & 0,0,3,0,1,0,1,2,2,1, & + & 0,0,4,0,1,0,1,3,3,0,2,2,1,1,2, & + & 0,0,5,0,2,0,3,2,3,0,1,0,1,4,4,3,1,1,1,2,2, & + & 0,0,6,0,3,3,0,1,5,5,1,0,0,2,4,4,0,2,1,2,2,3,1,3,1,1,4,2/) + + real(wp), parameter :: trafo(6,6) = reshape((/ & ! copied from scf.f, simplyfied + ! --- dS + & sqrt(1.0_wp/5.0_wp), & + & sqrt(1.0_wp/5.0_wp), & + & sqrt(1.0_wp/5.0_wp), & + & 0.0_wp,0.0_wp,0.0_wp, & + ! --- dx²-y² + & 0.5_wp*sqrt(3.0_wp), & + &-0.5_wp*sqrt(3.0_wp), & + & 0.0_wp,0.0_wp,0.0_wp,0.0_wp, & + ! --- dz² + & 0.5_wp,0.5_wp,-1.0_wp, & + & 0.0_wp,0.0_wp, 0.0_wp, & + ! --- rest + & 0.0_wp,0.0_wp,0.0_wp,1.0_wp,0.0_wp,0.0_wp, & + & 0.0_wp,0.0_wp,0.0_wp,0.0_wp,1.0_wp,0.0_wp, & + & 0.0_wp,0.0_wp,0.0_wp,0.0_wp,0.0_wp,1.0_wp /), shape(trafo)) + +contains + +pure subroutine build_sdq_ints_gpu(a,b,c,alpi,alpj,la,lb,kab,t,e,lx,ly,lz,v) + !$acc routine seq + implicit none + ! aufpunkte,ref point,intarray + integer,intent(in) :: la,lb + real(wp), intent(in) :: alpi,alpj + real(wp), intent(in) :: a(3),b(3),c(3) + real(wp), intent(in) :: kab,t(0:8),e(3) + integer, intent(in) :: lx(84),ly(84),lz(84) + real(wp), intent(out) :: v(10) + ! local variables + real(wp) :: dd(0:8),va(3),val(3,3) + real(wp) :: aa(0:3,3),bb(0:3,3) + integer :: i,j,ij(3),ii(3),jj(3),lmax + + val = 0 + + aa = 0 + bb = 0 + dd = 0 + ii = [lx(la),ly(la),lz(la)] + jj = [lx(lb),ly(lb),lz(lb)] + ij = ii+jj + aa(ii(1),1)=1.0_wp + aa(ii(2),2)=1.0_wp + aa(ii(3),3)=1.0_wp + bb(jj(1),1)=1.0_wp + bb(jj(2),2)=1.0_wp + bb(jj(3),3)=1.0_wp + + do i = 1, 3 + ! calculate cartesian prefactor for first gaussian + call build_hshift2(aa(:,i),a(i),e(i),ii(i)) ! + ! form their product + call prod3(aa(:,i),bb(:,i),dd(:),ii(i),jj(i)) + lmax = ij(i) + do j = 0, lmax + ! + va = [t(j), t(j+1) + e(i)*t(j), t(j+2) + 2*e(i)*t(j+1) + e(i)*e(i)*t(j)] + val(i,1:3) = val(i,1:3) + dd(j)*va(1:3) + enddo + enddo + v( 1)=kab*(val(1,1)*val(2,1)*val(3,1)) + v( 2)=kab*(val(1,2)*val(2,1)*val(3,1)) + v( 3)=kab*(val(1,1)*val(2,2)*val(3,1)) + v( 4)=kab*(val(1,1)*val(2,1)*val(3,2)) + v( 5)=kab*(val(1,3)*val(2,1)*val(3,1)) + v( 6)=kab*(val(1,1)*val(2,3)*val(3,1)) + v( 7)=kab*(val(1,1)*val(2,1)*val(3,3)) + v( 8)=kab*(val(1,2)*val(2,2)*val(3,1)) + v( 9)=kab*(val(1,2)*val(2,1)*val(3,2)) + v(10)=kab*(val(1,1)*val(2,2)*val(3,2)) +end subroutine build_sdq_ints_gpu + +!> Computes the dipole and quadrupole integrals and performs screening to +! determine, which contribute to potential +subroutine build_SDQH0_gpu(nShell, hData, nat, at, nbf, nao, xyz, trans, selfEnergy, & + & intcut, caoshell, saoshell, nprim, primcount, alp, cont, & + & sint, dpint, qpint, H0) + implicit none + integer, intent(in) :: nShell(:) + type(THamiltonianData), intent(in) :: hData + !> # of atoms + integer, intent(in) :: nat + !> # of spherical AOs (SAOs) + integer, intent(in) :: nao + !> # of Cartesian AOs (CAOs) + integer, intent(in) :: nbf + integer, intent(in) :: at(nat) + !> Cartesian coordinates + real(wp),intent(in) :: xyz(3,nat) + real(wp),intent(in) :: trans(:, :) + real(wp), intent(in) :: selfEnergy(:, :) + !> Integral cutoff according to prefactor from Gaussian product theorem + real(wp),intent(in) :: intcut + !> Map shell of atom to index in CAO space (lowest Cart. component is taken) + integer, intent(in) :: caoshell(:,:) + !> Map shell of atom to index in SAO space (lowest m_l component is taken) + integer, intent(in) :: saoshell(:,:) + integer, intent(in) :: nprim(:) + !> Index of first primitive (over entire system) of given CAO + integer, intent(in) :: primcount(:) + real(wp),intent(in) :: alp(:) + real(wp),intent(in) :: cont(:) + !> Overlap integral matrix + real(wp),intent(out) :: sint(nao,nao) + !> Dipole integral matrix + real(wp),intent(out) :: dpint(3,nao,nao) + !> Quadrupole integral matrix + real(wp),intent(out) :: qpint(6,nao,nao) + !> Core Hamiltonian + real(wp),intent(out) :: H0(:) + + + integer i,j,k,l,m,ii,jj,ll,mm,kk,ki,kj,kl,mi,mj,ij,ioff,joff + real(wp) tmp1,tmp2,tmp3,tmp4,step,step2 + real(wp) dx,dy,dz,s00r,s00l,s00,alpj + real(wp) skj,r1,r2,tt,t1,t2,t3,t4,thr2,f,ci,cc,cj,alpi,rab2,ab,est + + real(wp) ra(3),rb(3),f1,f2,point(3) + real(wp) dtmp(3),qtmp(6),ss(6,6),dd(6,6,3),qq(6,6,6),tmp(6,6),dd2(3,6,6) + integer ip,jp,iat,jat,izp,jzp,ish,jsh,icao,jcao,iao,jao,jshmax + integer ishtyp,jshtyp,iptyp,jptyp,naoi,naoj,mli,mlj,iprim,jprim + integer :: il, jl, itr + real(wp) :: zi, zj, zetaij, km, hii, hjj, hav, shpoly + integer itt(0:3) + parameter(itt =(/0,1,4,10/)) + real(wp) :: saw(10) + + real(wp) :: gama,kab,t(0:8),e(3) + real(wp),parameter :: sqrtpi = sqrt(pi) + + real(wp) s2(6,6),dum(6,6),sspher + + !$acc enter data create(H0(:),sint(nao,nao),dpint(3,nao,nao),qpint(6,nao,nao)) + + !$acc kernels default(present) + ! integrals + H0(:) = 0.0_wp + sint = 0.0_wp + dpint = 0.0_wp + qpint = 0.0_wp + !$acc end kernels + ! --- Aufpunkt for moment operator + point = 0.0_wp + + !$acc enter data copyin(xyz, at, nShell, selfEnergy, caoshell, saoshell, & + !$acc& nprim, primcount, alp, cont, intcut, trans, point, hData, & + !$acc& hData%principalQuantumNumber(:, :), hData%angShell(:, :), hData%valenceShell(:, :), & + !$acc& hData%numberOfPrimitives(:, :), hData%slaterExponent(:, :), hData%selfEnergy(:, :), & + !$acc& hData%referenceOcc(:, :), hData%kCN(:, :), hData%electronegativity(:), hData%atomicRad(:), & + !$acc& hData%shellPoly(:, :), hData%pairParam(:, :), hData%kQShell(:, :), hData%kQAtom(:)) + + !$acc parallel vector_length(32) private(ss,dd,qq,rb,iat,jat,izp,ci,ra,& + !$acc& rab2,jzp,ish,ishtyp,icao,naoi,iptyp, & + !$acc& jsh,jshmax,jshtyp,jcao,naoj,jptyp,shpoly, & + !$acc& est,alpi,alpj,ab,iprim,jprim,ip,jp,il,jl,hii,hjj,km,zi,zj,zetaij,hav, & + !$acc& mli,mlj,iao,jao,ii,jj,k,ij,itr,ioff,joff,t,e,saw,s2,dum) + + !$acc loop gang collapse(2) + do iat = 1, nat + do jat = 1, nat + if (jat.ge.iat) cycle + ra(1:3) = xyz(1:3,iat) + izp = at(iat) + jzp = at(jat) + do ish = 1, nShell(izp) + ishtyp = hData%angShell(ish,izp) + icao = caoshell(ish,iat) + naoi = llao(ishtyp) + iptyp = itt(ishtyp) + do jsh = 1, nShell(jzp) + jshtyp = hData%angShell(jsh,jzp) + jcao = caoshell(jsh,jat) + naoj = llao(jshtyp) + jptyp = itt(jshtyp) + + il = ishtyp+1 + jl = jshtyp+1 + ! diagonals are the same for all H0 elements + hii = selfEnergy(ish, iat) + hjj = selfEnergy(jsh, jat) + + ! we scale the two shells depending on their exponent + zi = hData%slaterExponent(ish, izp) + zj = hData%slaterExponent(jsh, jzp) + zetaij = (2 * sqrt(zi*zj)/(zi+zj))**hData%wExp + call h0scal(hData,il,jl,izp,jzp,hData%valenceShell(ish, izp).ne.0, & + & hData%valenceShell(jsh, jzp).ne.0,km) + + hav = 0.5_wp * km * (hii + hjj) * zetaij + + !$acc loop private(ss,dd,qq,s2,dum) + do itr = 1, size(trans, dim=2) + !$acc cache(ss,dd,qq,s2,dum) + rb(1:3) = xyz(1:3,jat) + trans(:, itr) + rab2 = sum( (rb-ra)**2 ) + + ! distance dependent polynomial + shpoly=shellPoly(hData%shellPoly(il,izp),hData%shellPoly(jl,jzp),& + & hData%atomicRad(izp),hData%atomicRad(jzp),ra,rb) + + ss = 0.0_wp + dd = 0.0_wp + qq = 0.0_wp + + !call get_multiints(icao,jcao,naoi,naoj,iptyp,jptyp,ra,rb,point, & + ! & intcut,nprim,primcount,alp,cont,ss,dd,qq) + !$acc loop vector private(saw,t,e) independent collapse(2) + do ip = 1,nprim(icao+1) + do jp = 1,nprim(jcao+1) + + iprim = ip+primcount(icao+1) + jprim = jp+primcount(jcao+1) + alpi = alp(iprim) + alpj = alp(jprim) + + gama = alpi+alpj + ab = 1.0_wp/(gama) + e = (alpi * ra + alpj * rb)/(alpi + alpj) + t(0:8) = olapp([(j,j=0,8)],gama) + gama = alpi+alpj + ab = 1.0_wp/(gama) + est = rab2*alpi*alpj*ab + kab = exp(-est)*(sqrtpi*sqrt(ab))**3 + + do mli = 1,naoi + do mlj = 1,naoj + saw = 0.0_wp + + ! prim-prim quadrupole and dipole integrals + call build_sdq_ints_gpu(ra,rb,point,alpi,alpj, & + & iptyp+mli,jptyp+mlj,kab,t,e,lx,ly,lz,saw) + + iprim = ip+primcount(icao+mli) + jprim = jp+primcount(jcao+mlj) + ci = cont(iprim) ! coefficients NOT the same (contain CAO2SAO lin. comb. coefficients) + cc = cont(jprim)*ci + ! from primitive integrals fill CAO-CAO matrix for ish-jsh block + ! ! overlap + !$acc atomic + ss(mlj,mli) = ss(mlj,mli)+saw(1)*cc + !$acc end atomic + ! dipole + do k=1,3 + !$acc atomic + dd(mlj,mli,k) = dd(mlj,mli,k)+saw(k+1)*cc + !$acc end atomic + enddo + do k = 1,6 + ! quadrupole + !$acc atomic + qq(mlj,mli,k) = qq(mlj,mli,k)+saw(k+4)*cc + !$acc end atomic + enddo + enddo ! mlj + enddo ! mli + enddo ! jp + enddo ! ip + + !transform from CAO to SAO + !call dtrf2(ss,ishtyp,jshtyp) + ! DTRF2(S) + !select case(ishtyp) + !case(0) ! s-d + if (ishtyp.ge.2.or.jshtyp.ge.2) then + if (ishtyp.eq.0) then + !ss + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*ss(m,1) + enddo + s2(jj,1)=sspher + enddo + ss(1:5,1) = s2(2:6,1) + + !dd + do k = 1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*dd(m,1,k) + enddo + s2(jj,1)=sspher + enddo + dd(1:5,1,k) = s2(2:6,1) + enddo + + !qq + do k = 1,6 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*qq(m,1,k) + enddo + s2(jj,1)=sspher + enddo + qq(1:5,1,k) = s2(2:6,1) + enddo + + !case(1) ! p-d + else if (ishtyp.eq.1) then + !ss + do ii=1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*ss(m,ii) + enddo + s2(jj,ii)=sspher + enddo + ss(1:5,ii) = s2(2:6,ii) + enddo + + ! dd + do k = 1,3 + do ii=1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*dd(m,ii,k) + enddo + s2(jj,ii)=sspher + enddo + dd(1:5,ii,k) = s2(2:6,ii) + enddo + enddo + + !qq + do k = 1,6 + do ii=1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*qq(m,ii,k) + enddo + s2(jj,ii)=sspher + enddo + qq(1:5,ii,k) = s2(2:6,ii) + enddo + enddo + !return + !end select + ! wasn't there, then try iat ... + !select case(jshtyp) + !case(0) ! d-s + else if (jshtyp.eq.0) then + !ss + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*ss(1,m) + enddo + s2(1,jj)=sspher + enddo + ss(1,1:5) = s2(1,2:6) + + !dd + do k = 1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*dd(1,m,k) + enddo + s2(1,jj)=sspher + enddo + dd(1,1:5,k) = s2(1,2:6) + enddo + + !qq + do k = 1,6 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*qq(1,m,k) + enddo + s2(1,jj)=sspher + enddo + qq(1,1:5,k) = s2(1,2:6) + enddo + !return + !case(1) ! d-p + else if (jshtyp.eq.1) then + + !ss + do ii=1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*ss(ii,m) + enddo + s2(ii,jj)=sspher + enddo + ss(ii,1:5) = s2(ii,2:6) + enddo + + !dd + do k = 1,3 + do ii=1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*dd(ii,m,k) + enddo + s2(ii,jj)=sspher + enddo + dd(ii,1:5,k) = s2(ii,2:6) + enddo + enddo + + !qq + do k = 1,6 + do ii=1,3 + do jj=1,6 + sspher=0 + do m=1,6 + sspher=sspher+trafo(m,jj)*qq(ii,m,k) + enddo + s2(ii,jj)=sspher + enddo + qq(ii,1:5,k) = s2(ii,2:6) + enddo + enddo + + else + ! return + !end select + ! if not returned up to here -> d-d + ! CB: transposing s in first dgemm is important for integrals other than S + + !CALL mctc_gemm(trafo, s, dum, transa='T') + + !ss ----------- + do i=1,6 + do j=1,6 + sspher = 0.0_wp + do k=1,6 + sspher = sspher + trafo(k,i) * ss(k,j) + enddo + dum(i,j) = sspher + enddo + enddo + + !CALL mctc_gemm(dum, trafo, s2, transb='N') + + do i=1,6 + do j=1,6 + sspher = 0.0_wp + do k=1,6 + sspher = sspher + dum(i,k) * trafo(k,j) + enddo + s2(i,j) = sspher + enddo + enddo + + do i=1,5 + do j=1,5 + ss(j,i) = s2(j+1,i+1) + enddo + enddo + ! end ss ----------- + + !dd -------- + do kk = 1,3 + do i=1,6 + do j=1,6 + sspher = 0.0_wp + do k=1,6 + sspher = sspher + trafo(k,i) * dd(k,j,kk) + enddo + dum(i,j) = sspher + enddo + enddo + + !CALL mctc_gemm(dum, trafo, s2, transb='N') + + do i=1,6 + do j=1,6 + sspher = 0.0_wp + do k=1,6 + sspher = sspher + dum(i,k) * trafo(k,j) + enddo + s2(i,j) = sspher + enddo + enddo + + do i=1,5 + do j=1,5 + dd(j,i,kk) = s2(j+1,i+1) + enddo + enddo + enddo + !end dd -------------- + + !qq --------------- + do kk = 1,6 + do i=1,6 + do j=1,6 + sspher = 0.0_wp + do k=1,6 + sspher = sspher + trafo(k,i) * qq(k,j,kk) + enddo + dum(i,j) = sspher + enddo + enddo + + !CALL mctc_gemm(dum, trafo, s2, transb='N') + + do i=1,6 + do j=1,6 + sspher = 0.0_wp + do k=1,6 + sspher = sspher + dum(i,k) * trafo(k,j) + enddo + s2(i,j) = sspher + enddo + enddo + + do i=1,5 + do j=1,5 + qq(j,i,kk) = s2(j+1,i+1) + enddo + enddo + enddo + !end qq ----------------- + + endif + endif + + ioff = saoshell(ish,iat) + joff = saoshell(jsh,jat) + + ! would be good to change data order of sint,dpint,qpint for + ! memory coalescence + !$acc loop vector collapse(2) + do ii = 1,llao2(ishtyp) + do jj = 1,llao2(jshtyp) + iao = ii + ioff + jao = jj + joff + ij = lin(iao, jao) + !$acc atomic + H0(ij) = H0(ij) + hav * shpoly * ss(jj, ii) + !$acc end atomic + !$acc atomic + sint(iao, jao) = sint(iao, jao) + ss(jj, ii) + !$acc end atomic + !$acc atomic + sint(jao, iao) = sint(jao, iao) + ss(jj, ii) + !$acc end atomic + do k = 1,3 + !$acc atomic + dpint(k, iao, jao) = dpint(k, iao, jao) + dd(jj, ii, k) + !$acc end atomic + !$acc atomic + dpint(k, jao, iao) = dpint(k, jao, iao) + dd(jj, ii, k) + !$acc end atomic + enddo + do k = 1,6 + !$acc atomic + qpint(k, iao, jao) = qpint(k, iao, jao) + qq(jj, ii, k) + !$acc end atomic + !$acc atomic + qpint(k, jao, iao) = qpint(k, jao, iao) + qq(jj, ii, k) + !$acc end atomic + enddo + enddo + enddo + enddo + enddo + enddo + enddo + enddo + !$acc end parallel + + !$acc exit data copyout(H0(:),sint(nao,nao),dpint(3,nao,nao),qpint(6,nao,nao)) + + ! dd array is transposed in ACC region, so dd2 is created for the diagonal + ! elements, which is still running on CPU + + ! diagonal elements + do iat = 1, nat + ra = xyz(:, iat) + izp = at(iat) + do ish = 1, nShell(izp) + ishtyp = hData%angShell(ish,izp) + do iao = 1, llao2(ishtyp) + i = iao+saoshell(ish,iat) + ii = i*(1+i)/2 + sint(i,i) = 1.0_wp + sint(i,i) + H0(ii) = H0(ii) + selfEnergy(ish, iat) + end do + + icao = caoshell(ish,iat) + naoi = llao(ishtyp) + iptyp = itt(ishtyp) + do jsh = 1, ish + jshtyp = hData%angShell(jsh,izp) + jcao = caoshell(jsh,iat) + naoj = llao(jshtyp) + jptyp = itt(jshtyp) + ss = 0.0_wp + dd2 = 0.0_wp + qq = 0.0_wp + call get_multiints(icao,jcao,naoi,naoj,iptyp,jptyp,ra,ra,point, & + & intcut,nprim,primcount,alp,cont,ss,dd2,qq) + !transform from CAO to SAO + !call dtrf2(ss,ishtyp,jshtyp) + do k = 1,3 + tmp(1:6, 1:6) = dd2(k,1:6, 1:6) + call dtrf2(tmp, ishtyp, jshtyp) + dd2(k, 1:6, 1:6) = tmp(1:6, 1:6) + enddo + do k = 1,6 + tmp(1:6, 1:6) = qq(k, 1:6, 1:6) + call dtrf2(tmp, ishtyp, jshtyp) + qq(k, 1:6, 1:6) = tmp(1:6, 1:6) + enddo + do ii = 1, llao2(ishtyp) + iao = ii + saoshell(ish,iat) + do jj = 1, llao2(jshtyp) + jao = jj + saoshell(jsh,iat) + if (jao > iao .and. ish == jsh) cycle + dpint(1:3, iao, jao) = dpint(1:3, iao, jao) + dd2(1:3, jj, ii) + if (iao /= jao) then + dpint(1:3, jao, iao) = dpint(1:3, jao, iao) + dd2(1:3, jj, ii) + end if + qpint(1:6, iao, jao) = qq(1:6, jj, ii) + if (jao /= iao) then + qpint(1:6, jao, iao) = qq(1:6, jj, ii) + end if + end do + end do + end do + end do + end do + + !$acc exit data delete(xyz, at, nShell, selfEnergy, caoshell, saoshell, & + !$acc& nprim, primcount, alp, cont, intcut, trans, point) + + !$acc exit data delete(hData, & + !$acc& hData%principalQuantumNumber(:, :), hData%angShell(:, :), hData%valenceShell(:, :), & + !$acc& hData%numberOfPrimitives(:, :), hData%slaterExponent(:, :), hData%selfEnergy(:, :), & + !$acc& hData%referenceOcc(:, :), hData%kCN(:, :), hData%electronegativity(:), hData%atomicRad(:), & + !$acc& hData%shellPoly(:, :), hData%pairParam(:, :), hData%kQShell(:, :), hData%kQAtom(:)) + +end subroutine build_SDQH0_gpu + +end module xtb_xtb_hamiltonian_gpu diff --git a/src/xtb/meson.build b/src/xtb/meson.build index 926a1db56..3d41056ff 100644 --- a/src/xtb/meson.build +++ b/src/xtb/meson.build @@ -27,6 +27,7 @@ srcs += files( 'gfn2.f90', 'halogen.f90', 'hamiltonian.f90', + 'hamiltonian_gpu.f90', 'multipole.f90', 'repulsion.f90', 'thirdorder.f90', From 3bce3d230008c816c429885c53237e8b9781adfa Mon Sep 17 00:00:00 2001 From: Sebastian Ehlert <28669218+awvwgk@users.noreply.github.com> Date: Sun, 6 Sep 2020 19:06:47 +0200 Subject: [PATCH 2/3] Explicitly mark files for preprocessing --- src/CMakeLists.txt | 2 +- src/disp/CMakeLists.txt | 2 +- src/disp/{dftd4.f90 => dftd4.F90} | 0 src/disp/meson.build | 2 +- src/mctc/CMakeLists.txt | 4 ++-- src/mctc/lapack/CMakeLists.txt | 2 +- src/mctc/lapack/{eigensolve.f90 => eigensolve.F90} | 0 src/mctc/lapack/meson.build | 2 +- src/mctc/{mctc_global.f90 => mctc_global.F90} | 0 src/mctc/{mctc_init.f90 => mctc_init.F90} | 0 src/mctc/meson.build | 4 ++-- src/meson.build | 2 +- src/{scf_module.f90 => scf_module.F90} | 0 13 files changed, 10 insertions(+), 10 deletions(-) rename src/disp/{dftd4.f90 => dftd4.F90} (100%) rename src/mctc/lapack/{eigensolve.f90 => eigensolve.F90} (100%) rename src/mctc/{mctc_global.f90 => mctc_global.F90} (100%) rename src/mctc/{mctc_init.f90 => mctc_init.F90} (100%) rename src/{scf_module.f90 => scf_module.F90} (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d8f927d5b..e41a6583a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -128,7 +128,7 @@ list(APPEND srcs "${dir}/scan_driver.f90" "${dir}/scanparam.f90" "${dir}/scc_core.f90" - "${dir}/scf_module.f90" + "${dir}/scf_module.F90" "${dir}/screening.f90" "${dir}/set_module.f90" "${dir}/setparam.f90" diff --git a/src/disp/CMakeLists.txt b/src/disp/CMakeLists.txt index c4d3b0007..e4a797890 100644 --- a/src/disp/CMakeLists.txt +++ b/src/disp/CMakeLists.txt @@ -22,7 +22,7 @@ list(APPEND srcs "${dir}/dftd3.f" "${dir}/dftd3.f90" "${dir}/dftd3_parameters.f90" - "${dir}/dftd4.f90" + "${dir}/dftd4.F90" "${dir}/dftd4_parameters.f90" "${dir}/encharges.f90" "${dir}/ncoord.f90" diff --git a/src/disp/dftd4.f90 b/src/disp/dftd4.F90 similarity index 100% rename from src/disp/dftd4.f90 rename to src/disp/dftd4.F90 diff --git a/src/disp/meson.build b/src/disp/meson.build index 2eda9103b..5d8761a8a 100644 --- a/src/disp/meson.build +++ b/src/disp/meson.build @@ -20,7 +20,7 @@ srcs += files( 'dftd3.f', 'dftd3.f90', 'dftd3_parameters.f90', - 'dftd4.f90', + 'dftd4.F90', 'dftd4_parameters.f90', 'encharges.f90', 'ncoord.f90', diff --git a/src/mctc/CMakeLists.txt b/src/mctc/CMakeLists.txt index cc6eb3ad3..0e04df990 100644 --- a/src/mctc/CMakeLists.txt +++ b/src/mctc/CMakeLists.txt @@ -38,7 +38,7 @@ list(APPEND srcs "${dir}/systools.F90" "${dir}/thresholds.f90" "${dir}/version.f90" - "${dir}/mctc_global.f90" + "${dir}/mctc_global.F90" "${dir}/mctc_strings.f90" "${dir}/mctc_constants.f90" "${dir}/mctc_param.f90" @@ -53,7 +53,7 @@ list(APPEND srcs "${dir}/linalg.f90" "${dir}/lapack.f90" "${dir}/blas.f90" - "${dir}/mctc_init.f90" + "${dir}/mctc_init.F90" "${dir}/error.f90" "${dir}/signal.c" ) diff --git a/src/mctc/lapack/CMakeLists.txt b/src/mctc/lapack/CMakeLists.txt index c3a156752..71a37eabe 100644 --- a/src/mctc/lapack/CMakeLists.txt +++ b/src/mctc/lapack/CMakeLists.txt @@ -18,7 +18,7 @@ set(dir "${CMAKE_CURRENT_SOURCE_DIR}") list(APPEND srcs - "${dir}/eigensolve.f90" + "${dir}/eigensolve.F90" "${dir}/geneigval.f90" "${dir}/gst.f90" "${dir}/stdeigval.f90" diff --git a/src/mctc/lapack/eigensolve.f90 b/src/mctc/lapack/eigensolve.F90 similarity index 100% rename from src/mctc/lapack/eigensolve.f90 rename to src/mctc/lapack/eigensolve.F90 diff --git a/src/mctc/lapack/meson.build b/src/mctc/lapack/meson.build index 24d436f77..2787337a8 100644 --- a/src/mctc/lapack/meson.build +++ b/src/mctc/lapack/meson.build @@ -16,7 +16,7 @@ # along with xtb. If not, see . srcs += files( - 'eigensolve.f90', + 'eigensolve.F90', 'geneigval.f90', 'gst.f90', 'stdeigval.f90', diff --git a/src/mctc/mctc_global.f90 b/src/mctc/mctc_global.F90 similarity index 100% rename from src/mctc/mctc_global.f90 rename to src/mctc/mctc_global.F90 diff --git a/src/mctc/mctc_init.f90 b/src/mctc/mctc_init.F90 similarity index 100% rename from src/mctc/mctc_init.f90 rename to src/mctc/mctc_init.F90 diff --git a/src/mctc/meson.build b/src/mctc/meson.build index fd4dd0142..67709935d 100644 --- a/src/mctc/meson.build +++ b/src/mctc/meson.build @@ -36,7 +36,7 @@ srcs += files( 'systools.F90', 'thresholds.f90', 'version.f90', - 'mctc_global.f90', + 'mctc_global.F90', 'mctc_strings.f90', 'mctc_constants.f90', 'mctc_param.f90', @@ -51,7 +51,7 @@ srcs += files( 'linalg.f90', 'lapack.f90', 'blas.f90', - 'mctc_init.f90', + 'mctc_init.F90', 'error.f90', 'signal.c', ) diff --git a/src/meson.build b/src/meson.build index 1bd0d5fd7..4c51a4077 100644 --- a/src/meson.build +++ b/src/meson.build @@ -126,7 +126,7 @@ srcs += files( 'scan_driver.f90', 'scanparam.f90', 'scc_core.f90', - 'scf_module.f90', + 'scf_module.F90', 'screening.f90', 'set_module.f90', 'setparam.f90', diff --git a/src/scf_module.f90 b/src/scf_module.F90 similarity index 100% rename from src/scf_module.f90 rename to src/scf_module.F90 From b5103a3d75fe9a5f04f07f3f81bad2f3bf38f037 Mon Sep 17 00:00:00 2001 From: Sebastian Ehlert <28669218+awvwgk@users.noreply.github.com> Date: Sun, 6 Sep 2020 19:18:23 +0200 Subject: [PATCH 3/3] Fix CMake build --- src/xtb/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xtb/CMakeLists.txt b/src/xtb/CMakeLists.txt index 46fcdd8c7..d74e2ac49 100644 --- a/src/xtb/CMakeLists.txt +++ b/src/xtb/CMakeLists.txt @@ -29,6 +29,7 @@ list(APPEND srcs "${dir}/gfn2.f90" "${dir}/halogen.f90" "${dir}/hamiltonian.f90" + "${dir}/hamiltonian_gpu.f90" "${dir}/multipole.f90" "${dir}/repulsion.f90" "${dir}/thirdorder.f90"