From 383736e85533ae6687b6194135acd12bbd0176f7 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Mon, 11 Jan 2021 15:49:53 +0100
Subject: [PATCH 01/10] Added kernel routines for XL-BOMD

---
 src/CMakeLists.txt             |   2 +
 src/prg_ewald_mod.F90          | 751 +++++++++++++++++++++++++++++
 src/prg_implicit_fermi_mod.F90 | 326 +++++++++++--
 src/prg_xlbokernel_mod.F90     | 837 +++++++++++++++++++++++++++++++++
 4 files changed, 1879 insertions(+), 37 deletions(-)
 create mode 100644 src/prg_ewald_mod.F90
 create mode 100644 src/prg_xlbokernel_mod.F90

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9b6b5ce4..beaa317a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,6 +18,7 @@ add_library(progress
   prg_chebyshev_mod.F90
   prg_densitymatrix_mod.F90
   prg_dos_mod.F90
+  prg_ewald_mod.F90
   prg_extras_mod.F90
   prg_genz_mod.F90
   prg_graph_mod.F90
@@ -47,6 +48,7 @@ add_library(progress
   prg_system_mod.F90
   prg_timer_mod.F90
   prg_xlbo_mod.F90
+  prg_xlbokernel_mod.F90
   prg_xlkernel_mod.F90)
 
 if(OPENMP_FOUND)
diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
new file mode 100644
index 00000000..5a032412
--- /dev/null
+++ b/src/prg_ewald_mod.F90
@@ -0,0 +1,751 @@
+! Ewald sum routines for kernel calculation 
+module prg_ewald_mod
+
+  use bml
+  use prg_timer_mod
+  use prg_parallel_mod
+
+  implicit none
+
+  private  !Everything is private by default
+
+  integer, parameter :: dp = kind(1.0d0)
+
+  public :: Ewald_Real_Space_Single
+  public :: Ewald_Real_Space_Single_latte
+  public :: Ewald_Real_Space
+  public :: Ewald_Real_Space_latte
+  public :: Ewald_k_space
+  public :: Ewald_k_space_latte
+
+contains
+
+!> Find Coulomb potential on site I from single charge at site J          
+subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
+    DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
+real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+real(PREC), intent(in)    :: U(Nr_elem)
+real(PREC)                :: COULCUT, COULCUT2
+real(PREC), intent(out)   :: COULOMBV 
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+integer                   :: K, ccnt,l,m,n
+
+COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+COULOMBV = ZERO
+
+TI = TFACT*U(Element_Pointer(I))
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RXYZ(1,I)
+Ra(2) = RXYZ(2,I)
+Ra(3) = RXYZ(3,I)
+
+    do k = -1,1
+    do m = -1,1
+    do l = -1,1
+
+    Rb(1) = RXYZ(1,J)+k*box(1,1)
+    Rb(2) = RXYZ(2,J)+m*box(2,2)
+    Rb(3) = RXYZ(3,J)+l*box(3,3)
+    Rab = Rb-Ra  ! OBS b - a !!!
+    dR = norm2(Rab)
+    MAGR = dR
+    MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(Element_Pointer(J))
+       DC = Rab/dR
+
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Pointer(I).eq.Element_Pointer(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+       endif
+    endif
+    enddo
+    enddo
+    enddo
+
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space_Single_latte
+
+subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+    DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
+
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
+real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+real(PREC), intent(in)    :: U(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+character(10), intent(in) :: Element_Type(Nr_atoms)
+real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+integer                   :: K, ccnt,l,m,n
+
+COULVOL = LBox(1)*LBox(2)*LBox(3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+FCOUL = ZERO
+COULOMBV = ZERO
+
+TI = TFACT*U(I)
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RX(I)
+Ra(2) = RY(I)
+Ra(3) = RZ(I)
+
+    do k = -1,1
+    do m = -1,1
+    do l = -1,1
+
+    Rb(1) = RX(J)+k*Lbox(1)
+    Rb(2) = RY(J)+m*Lbox(2)
+    Rb(3) = RZ(J)+l*Lbox(3)
+    Rab = Rb-Ra  ! OBS b - a !!!
+    dR = norm2(Rab)
+    MAGR = dR
+    MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(J)
+       DC = Rab/dR
+
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Type(I).eq.Element_Type(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+       endif
+
+       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+    endif
+    enddo
+    enddo
+    enddo
+
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space_Single
+
+subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
+    DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
+real(PREC), intent(in)    :: COULACC
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+real(PREC), intent(in)    :: U(Nr_elem)
+real(PREC)                :: COULCUT, COULCUT2
+integer, intent(in)       :: Element_Pointer(Nr_atoms)
+integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+real(PREC), intent(out)   :: COULOMBV
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+COULOMBV = ZERO
+
+TI = TFACT*U(Element_Pointer(I))
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RXYZ(1,I)
+Ra(2) = RXYZ(2,I)
+Ra(3) = RXYZ(3,I)
+
+do newj = 1,totnebcoul(I)
+        J = NEBCOUL(1, NEWJ, I)
+        PBCI = NEBCOUL(2, NEWJ, I)
+        PBCJ = NEBCOUL(3, NEWJ, I)
+        PBCK = NEBCOUL(4, NEWJ, I)
+        Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+             REAL(PBCK)*BOX(3,1) 
+
+        Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+             REAL(PBCK)*BOX(3,2) 
+
+        Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+             REAL(PBCK)*BOX(3,3) 
+        Rab = Rb-Ra  ! OBS b - a !!!
+        dR = norm2(Rab)
+        MAGR = dR
+        MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(Element_Pointer(J))
+       DC = Rab/dR
+
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Pointer(I).eq.Element_Pointer(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+       endif
+
+    endif
+enddo
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space_latte
+
+subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+    DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
+real(PREC), intent(in)    :: COULACC, TIMERATIO
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+real(PREC), intent(in)    :: U(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+character(10), intent(in) :: Element_Type(Nr_atoms)
+integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+integer                   :: J,K, ccnt, nnI
+
+COULVOL = LBox(1)*LBox(2)*LBox(3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+FCOUL = ZERO
+COULOMBV = ZERO
+
+TI = TFACT*U(I)
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RX(I)
+Ra(2) = RY(I)
+Ra(3) = RZ(I)
+
+do nnI = 1,nrnnlist(I)
+  Rb(1) = nnRx(I,nnI)
+  Rb(2) = nnRy(I,nnI)
+  Rb(3) = nnRz(I,nnI)
+  J = nnType(I,nnI)
+    Rab = Rb-Ra  ! OBS b - a !!!
+    dR = norm2(Rab)
+    MAGR = dR
+    MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(J)
+       DC = Rab/dR
+
+       ! Not Using Numerical Recipes ERFC
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       !TEST(ccnt) = DELTAQ(J)*CA
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Type(I).eq.Element_Type(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+           !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+       endif
+
+       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+    endif
+enddo
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space
+
+subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+
+implicit none
+
+integer, parameter        :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+real(PREC), intent(in)    :: COULACC
+real(PREC)                :: KECONST, TFACT, RELPERM
+real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+
+COULCUT = 12.0D0
+CALPHA = SQRTX/COULCUT
+
+COULCUT2 = COULCUT*COULCUT
+KCUTOFF = TWO*CALPHA*SQRTX
+KCUTOFF2 = KCUTOFF*KCUTOFF
+CALPHA2 = CALPHA*CALPHA
+FOURCALPHA2 = FOUR*CALPHA2
+
+RECIPVECS = ZERO
+RECIPVECS(1,1) = TWO*pi/Box(1,1)
+RECIPVECS(2,2) = TWO*pi/Box(2,2)
+RECIPVECS(3,3) = TWO*pi/Box(3,3)
+LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+RELPERM = 1.D0
+KECONST = 14.3996437701414D0*RELPERM
+
+COULOMBV = ZERO
+SINLIST = ZERO
+COSLIST = ZERO
+
+do L = 0,LMAX
+
+  if (L.eq.0) then
+    MMIN = 0
+  else
+    MMIN = -MMAX
+  endif
+
+  L11 = L*RECIPVECS(1,1)
+  L12 = L*RECIPVECS(1,2)
+  L13 = L*RECIPVECS(1,3)
+
+  do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+           K(1) = M21 + N*RECIPVECS(3,1)
+           K(2) = M22 + N*RECIPVECS(3,2)
+           K(3) = M23 + N*RECIPVECS(3,3)
+           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+           if (K2.le.KCUTOFF2) then
+              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+              COSSUM = 0.D0
+              SINSUM = 0.D0
+
+              ! Doing the sin and cos sums
+              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+              !$OMP REDUCTION(+:COSSUM) & 
+              !$OMP REDUCTION(+:SINSUM)  
+              do I = 1,Nr_atoms
+                 DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+                 ! We re-use these in the next loop...
+                 SINLIST(I) = sin(DOT)
+                 COSLIST(I) = cos(DOT)
+                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+              enddo
+              !$OMP END PARALLEL DO
+              COSSUM2 = COSSUM*COSSUM
+              SINSUM2 = SINSUM*SINSUM
+
+              ! Add up energy and force contributions
+
+              KEPREF = KECONST*PREFACTOR
+              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I) 
+              do I = 1,Nr_atoms
+                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+              enddo
+              !$OMP END PARALLEL DO
+
+              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+           endif
+        enddo
+  enddo
+enddo
+
+! Point self energy
+CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+end subroutine Ewald_k_Space_latte
+
+subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
+
+implicit none
+
+integer, parameter        :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+real(PREC), intent(in)    :: COULACC, TIMERATIO
+real(PREC)                :: KECONST, TFACT, RELPERM
+real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+COULVOL = LBox(1)*LBox(2)*LBox(3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+
+COULCUT = 12.0D0
+CALPHA = SQRTX/COULCUT
+
+COULCUT2 = COULCUT*COULCUT
+KCUTOFF = TWO*CALPHA*SQRTX
+KCUTOFF2 = KCUTOFF*KCUTOFF
+CALPHA2 = CALPHA*CALPHA
+FOURCALPHA2 = FOUR*CALPHA2
+
+RECIPVECS = ZERO
+RECIPVECS(1,1) = TWO*pi/LBox(1)
+RECIPVECS(2,2) = TWO*pi/LBox(2)
+RECIPVECS(3,3) = TWO*pi/LBox(3)
+LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+RELPERM = 1.D0
+KECONST = 14.3996437701414D0*RELPERM
+
+FCOUL = ZERO
+COULOMBV = ZERO
+SINLIST = ZERO
+COSLIST = ZERO
+
+do L = 0,LMAX
+
+  if (L.eq.0) then
+    MMIN = 0
+  else
+    MMIN = -MMAX
+  endif
+
+  L11 = L*RECIPVECS(1,1)
+  L12 = L*RECIPVECS(1,2)
+  L13 = L*RECIPVECS(1,3)
+
+  do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+           K(1) = M21 + N*RECIPVECS(3,1)
+           K(2) = M22 + N*RECIPVECS(3,2)
+           K(3) = M23 + N*RECIPVECS(3,3)
+           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+           if (K2.le.KCUTOFF2) then
+              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+              COSSUM = 0.D0
+              SINSUM = 0.D0
+
+              ! Doing the sin and cos sums
+              do I = 1,Nr_atoms
+                 DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
+                 ! We re-use these in the next loop...
+                 SINLIST(I) = sin(DOT)
+                 COSLIST(I) = cos(DOT)
+                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+              enddo
+              COSSUM2 = COSSUM*COSSUM
+              SINSUM2 = SINSUM*SINSUM
+
+              ! Add up energy and force contributions
+
+              KEPREF = KECONST*PREFACTOR
+              do I = 1,Nr_atoms
+                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+                 FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
+                 FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
+                 FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
+                 FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
+              enddo
+
+              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+           endif
+        enddo
+  enddo
+enddo
+
+! Point self energy
+CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+end subroutine Ewald_k_Space
+
+end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 5c30947c..2e217a98 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -5,11 +5,13 @@
 !!
 module prg_implicit_fermi_mod
 
+  use omp_lib
   use bml
   use prg_normalize_mod
   use prg_densitymatrix_mod
   use prg_timer_mod
   use prg_parallel_mod
+  use prg_ewald_mod
 
   implicit none
 
@@ -18,13 +20,164 @@ module prg_implicit_fermi_mod
   integer, parameter :: dp = kind(1.0d0)
 
   public :: prg_implicit_fermi
+  public :: prg_implicit_fermi_save_inverse
   public :: prg_implicit_fermi_zero
   public :: prg_test_density_matrix
   public :: prg_implicit_fermi_response
+  public :: prg_implicit_fermi_first_order_response
   public :: prg_finite_diff
 
 contains
 
+  !> Recursive Implicit Fermi Dirac for finite temperature.
+  !! \param Inv_bml Inverses generated by algorithm.  
+  !! \param h_bml Input Hamiltonian matrix.
+  !! \param p_bml Output density matrix.
+  !! \param nsteps Number of recursion steps.
+  !! \param nocc Number of occupied states.
+  !! \param mu Shifted chemical potential
+  !! \param beta Input inverse temperature.
+  !! \param occErrLimit Occupation error limit.
+  !! \param threshold Threshold for multiplication.
+  !! \param tol Tolerance for linear system solver.
+  !! \param SCF_IT The current SCF iteration. 
+  !! \param occiter Counts the total nr of DM calculations during MD. 
+  !! See \cite{niklasson2003}
+  subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc, &
+       mu, beta, occErrLimit, threshold, tol,SCF_IT, occiter)
+
+    implicit none
+
+    type(bml_matrix_t), intent(in) :: h_bml
+    type(bml_matrix_t), intent(inout) :: p_bml, Inv_bml(nsteps)
+    integer, intent(in) :: nsteps, SCF_IT
+    real(dp), intent(in) :: nocc, threshold
+    real(dp), intent(in) :: tol
+    real(dp), intent(in) :: occErrLimit, beta
+    real(dp), intent(inout) :: mu
+    integer, intent(inout) :: occiter
+
+    type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
+    real(dp) :: trdPdmu, trP0, occErr, alpha 
+    real(dp) :: cnst, ofactor, mustep 
+    real(dp), allocatable :: trace(:), gbnd(:)
+    character(20) :: bml_type
+    integer :: N, M, i, iter, muadj, prev
+
+    bml_type = bml_get_type(h_bml)
+    N = bml_get_N(h_bml)
+    M = bml_get_M(h_bml)
+
+    allocate(trace(2))
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml) 
+
+    occErr = 10.0_dp
+    alpha = 1.0_dp
+    prev = 0
+    iter = 0
+    cnst = beta/(1.0_dp*2**(nsteps+2))
+
+    if (SCF_IT .eq. 1) then
+      ! Normalization
+      ! P0 = 0.5*I - cnst*(H0-mu0*I)
+      call bml_copy(h_bml, p_bml)
+      call prg_normalize_implicit_fermi(p_bml, cnst, mu)
+      ! Generate good starting guess for (2*(P2-P)+1)^-1 using conjugate gradient
+      call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+      ! Y = 2*(P2-P) + II
+      call bml_copy(p2_bml, y_bml)
+      call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+      call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+      call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+    else
+      ! Otherwise use previous inverse as starting guess
+      call bml_copy(Inv_bml(1),ai_bml) 
+    end if
+
+    do while (occErr .gt. occErrLimit .or. muadj .eq. 1) 
+      iter = iter + 1
+      muadj = 0
+      write(*,*) 'mu =', mu
+      ! Normalization
+      ! P0 = 0.5*I - cnst*(H0-mu0*I)
+      call bml_copy(h_bml, p_bml)
+      call prg_normalize_implicit_fermi(p_bml, cnst, mu)
+
+        do i = 1, nsteps
+            call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+            ! Y = 2*(P2-P) + I
+            call bml_copy(p2_bml, y_bml)
+            call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+            call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+            ! Find inverse ai = (2*(P2-P)+I)^-1
+            !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
+            call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+            call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+            call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation  
+        enddo
+
+      trdPdmu = bml_trace(p_bml)
+      trP0 = trdPdmu
+      trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
+      trdPdmu = beta * trdPdmu
+      occErr = abs(trP0 - nocc)
+      write(*,*) 'occerr =', nocc-trP0
+
+      ! If occupation error is too large, do bisection method 
+      if (occerr > 10.0_dp) then 
+         if (nocc-trP0 < 0.0_dp) then
+            if (prev .eq. 1) then
+               alpha = alpha/2
+            endif 
+            prev = -1
+            mu = mu - alpha
+         else 
+            if (prev .eq. -1) then
+               alpha = alpha/2
+            endif
+            prev = 1
+            mu = mu + alpha 
+         endif  
+      ! Otherwise do Newton 
+      else if (occErr .gt. occErrLimit) then
+        mustep = (nocc -trP0)/trdPdmu
+        if (abs(mustep) > 1.0) then
+          mustep = 0.1_dp*mustep
+        end if  
+        mu = mu + mustep 
+        muadj = 1
+      end if
+    enddo
+    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge. 
+    ! For now we recompute the DM one extra time if mu was adjusted. 
+    !if (muadj .eq. 1) then
+      ! Adjust occupation
+     ! call bml_copy(p_bml, d_bml)
+     ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
+     ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
+     ! ofactor = ((nocc - trP0)/trdPdmu) * beta
+     ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    !end if
+    occiter = occiter + iter
+    call bml_scale(2.0_dp,p_bml)
+    deallocate(trace)
+
+    call bml_deallocate(p2_bml)
+    call bml_deallocate(w_bml)
+    call bml_deallocate(d_bml)
+    call bml_deallocate(y_bml)
+    call bml_deallocate(aux_bml)
+    call bml_deallocate(ai_bml)
+    call bml_deallocate(I_bml)
+
+  end subroutine prg_implicit_fermi_save_inverse
+  
   !> Recursive Implicit Fermi Dirac for finite temperature.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
@@ -63,6 +216,11 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     N = bml_get_N(h_bml)
     M = bml_get_M(h_bml)
 
+    call bml_print_matrix("h_bml",h_bml,1,10,1,10)
+    call bml_print_matrix("p_bml",p_bml,1,10,1,10)
+    write(*,*) nsteps, k, nocc, &
+       mu, beta, method, osteps, occErrLimit, threshold, tol
+    !stop
     allocate(trace(2))
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
@@ -106,7 +264,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
           else
             call prg_setup_linsys(p_bml, y_bml, p2_bml, d_bml, w_bml, aux1_bml, aux2_bml, k, threshold)
           end if
-          call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, tol, threshold)
+          call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, aux1_bml, w_bml, tol, threshold)
         enddo
       else
         write(*,*) "Doing NS"
@@ -123,9 +281,9 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
             call prg_setup_linsys(p_bml, y_bml, p2_bml, d_bml, w_bml, aux1_bml, aux2_bml, k, threshold)
           end if
           if (i .eq. 1) then
-            call prg_conjgrad(y_bml, ai_bml, I_bml, d_bml, w_bml, 0.9_dp, threshold)
+            call prg_conjgrad(y_bml, ai_bml, I_bml, aux1_bml, d_bml, w_bml, 0.9_dp, threshold)
           end if
-          call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, tol, threshold)
+          call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux1_bml, I_bml, tol, threshold)
           call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
         enddo
 
@@ -185,7 +343,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     real(dp), intent(in) :: mu, threshold
     real(dp), intent(inout), optional :: tol
 
-    type(bml_matrix_t) :: w_bml, y_bml, d_bml, p2_bml, aux1_bml, aux2_bml, I_bml, ai_bml
+    type(bml_matrix_t) :: w_bml, y_bml, c_bml, d_bml, p2_bml, aux1_bml, aux2_bml, I_bml, ai_bml
     real(dp) :: cnst
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
@@ -201,6 +359,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, c_bml)
     if (method .eq. 1) then
       call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
       call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
@@ -221,7 +380,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
         call bml_copy(p2_bml, y_bml)
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
-        call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, tol, threshold)
+        call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, c_bml, tol, threshold)
       enddo
     else
       write(*,*) "Doing NS"
@@ -232,9 +391,9 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         if (i .eq. 1) then
-          call prg_conjgrad(y_bml, ai_bml, I_bml, d_bml, w_bml, 0.9_dp, threshold)
+          call prg_conjgrad(y_bml, ai_bml, I_bml, c_bml, d_bml, w_bml, 0.9_dp, threshold)
         end if
-        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, tol, threshold)
+        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, c_bml, I_bml, tol, threshold)
         call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
       enddo
     endif
@@ -246,6 +405,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     call bml_deallocate(w_bml)
     call bml_deallocate(d_bml)
     call bml_deallocate(y_bml)
+    call bml_deallocate(c_bml)
     if (method .eq. 1) then
       call bml_deallocate(ai_bml)
       call bml_deallocate(I_bml)
@@ -253,6 +413,89 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
 
   end subroutine prg_implicit_fermi_zero
 
+
+  !> Calculate first order density matrix response to perturbations using Implicit Fermi Dirac.
+  !! \param H0_bml Input Hamiltonian matrix.
+  !! \param H1_bml Input First order perturbation of H0.
+  !! \param P0_bml Output density matrix.
+  !! \param P1_bml Output First order density matrix response.
+  !! \param nsteps Number of recursion steps.
+  !! \param mu0 Shifted chemical potential.
+  !! \param beta Input inverse temperature.
+  !! \param nocc Number of occupied states.
+  !! \param threshold Threshold for matrix algebra.
+  !! See \cite{niklasson2015}
+  subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bml, &
+       Inv_bml, nsteps, mu0, beta, nocc, threshold)
+
+    implicit none
+
+    type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, Inv_bml(nsteps)
+    type(bml_matrix_t), intent(inout) :: P0_bml, P1_bml
+    real(dp), intent(in) :: mu0, threshold
+    real(dp)  :: mu1
+    real(dp), intent(in) :: beta, nocc
+    integer, intent(in) :: nsteps
+    type(bml_matrix_t) :: B_bml, C_bml, C0_bml
+    character(20) :: bml_type
+    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst 
+    integer :: N, M, i, j, k
+
+    bml_type = bml_get_type(H0_bml)
+    N = bml_get_N(H0_bml)
+    M = bml_get_M(H0_bml)
+
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
+
+    cnst = beta/(2**(2+nsteps))
+
+      ! P0 = 0.5*II - cnst*(H0-mu0*II)
+      call bml_copy(H0_bml, P0_bml)
+      call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
+
+      ! P1 =  - cnst*H1
+      call bml_copy(H1_bml, P1_bml)
+      call bml_scale(-1.0_dp*cnst, P1_bml)
+      do i = 1, nsteps
+
+        ! Calculate coefficient matrices
+        ! C0 = P0^2
+        call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+        ! C = P0*P1+P1*P0, B = 2(P1 - C)
+        call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+        call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+        call bml_copy(P1_bml, B_bml)
+        call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+        ! Get next P0
+        call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
+        ! Get next P1
+        ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
+        call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+        call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+      enddo
+
+      ! dPdmu = beta*P0(I-P0)
+      call bml_copy(P0_bml, B_bml)
+      call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
+      call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+      call bml_scale(beta,C_bml)
+      dPdmu_trace = bml_trace(C_bml)
+      p1_trace = bml_trace(P1_bml)
+      mu1 =  - p1_trace/dPdmu_trace
+      if (abs(dPdmu_trace) > 1e-8) then
+        call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
+      endif 
+
+      call bml_deallocate(B_bml)
+      call bml_deallocate(C_bml)
+      call bml_deallocate(C0_bml)
+
+
+  end subroutine prg_implicit_fermi_first_order_response
+
+
   !> Calculate density matrix response to perturbations using Implicit Fermi Dirac.
   !! \param H0_bml Input Hamiltonian matrix.
   !! \param H1_bml, H2_bml, H3_bml Input First to third order perturbations of H0.
@@ -270,7 +513,7 @@ end subroutine prg_implicit_fermi_zero
   !! See \cite{niklasson2015}
   subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P1_bml, P2_bml, P3_bml, &
        nsteps, mu0, mu, beta, nocc, occ_tol, lin_tol, order, threshold)
-
+  
     implicit none
 
     type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, H2_bml, H3_bml
@@ -279,7 +522,7 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
     real(dp), allocatable, intent(inout) :: mu(:)
     real(dp), intent(in) :: beta, occ_tol, lin_tol, nocc
     integer, intent(in) :: nsteps
-    type(bml_matrix_t) :: I_bml, tmp1_bml, tmp2_bml, C0_bml, T_bml, Ti_bml
+    type(bml_matrix_t) :: I_bml, tmp1_bml, tmp2_bml, tmp3_bml, C0_bml, T_bml, Ti_bml
     type(bml_matrix_t), allocatable :: B_bml(:), P_bml(:), C_bml(:), H_bml(:)
     real(dp), allocatable :: p_trace(:), trace(:)
     character(20) :: bml_type
@@ -305,6 +548,7 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
     end do
 
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp1_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp3_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, T_bml)
@@ -371,10 +615,10 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
         call bml_scale_add_identity(T_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find T-inverse
         if (i .eq. 1) then
-          call prg_conjgrad(T_bml, Ti_bml, I_bml, tmp1_bml, tmp2_bml, 0.01_dp, threshold)
+          call prg_conjgrad(T_bml, Ti_bml, I_bml, tmp1_bml, tmp2_bml, tmp3_bml,0.01_dp, threshold)
           call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
         end if
-        call prg_newtonschulz(T_bml, Ti_bml, tmp1_bml, tmp2_bml, lin_tol, threshold)
+        call prg_newtonschulz(T_bml, Ti_bml, tmp1_bml, tmp2_bml, tmp3_bml, I_bml, lin_tol, threshold)
         ! Get next P0
         call bml_multiply(Ti_bml, C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
         ! Get next P1
@@ -608,31 +852,39 @@ end subroutine prg_setup_linsys
   !! \param tol Convergence criterion (Frobenius norm of residual matrix)
   !! \param threshold Threshold for matrix algebra
 
-  subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, tol, threshold)
+  subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, threshold)
 
     implicit none
 
-    type(bml_matrix_t), intent(inout) :: ai_bml, r_bml, tmp_bml
-    type(bml_matrix_t), intent(in) :: a_bml
+    type(bml_matrix_t), intent(inout) :: ai_bml, r_bml, tmp_bml, d_bml
+    type(bml_matrix_t), intent(in) :: a_bml, I_bml
     real(dp), intent(in) :: threshold, tol
-    real(dp) :: norm
-    integer :: i
+    real(dp) :: err,prev_err,scaled_tol
+    integer :: i,N,N2
 
-    norm = 1.0
+    N = bml_get_N(a_bml)
+    err = 100000.0
     i = 0
-    do while(norm > tol)
+    N2 = N*N
+    scaled_tol = tol*N
+    do while(err > scaled_tol)
+      !write(*,*) 'iter = ', i 
       call bml_copy(ai_bml, tmp_bml)
       call bml_multiply(a_bml, ai_bml, r_bml, 1.0_dp, 0.0_dp, threshold)
       call bml_scale_add_identity(r_bml, -1.0_dp, 1.0_dp, threshold)
-      norm = bml_fnorm(r_bml)
-      !   write(*,*) "norm = ", norm
-      if (norm < tol) then
-        exit
-      end if
-      call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
+      prev_err = err
+      err = bml_fnorm(r_bml)
+      !write(*,*) "err = ", err
+      !write(*,*) "prev_err = ", prev_err
+      if (10*prev_err < err) then 
+        write(*,*) 'NS did not converge, calling conjugate gradient'
+        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.0001_dp, threshold)
+      else 
+        call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
+      endif 
       i = i + 1
     enddo
-    ! write(*,*) "Number of NS iterations:", i
+    !write(*,*) "Number of NS iterations:", i
   end subroutine prg_newtonschulz
 
   ! Preconditioned CG, preconditioner inverse diagonal of A
@@ -717,42 +969,42 @@ end subroutine prg_pcg
   !! \param w_bml Auxillary matrix
   !! \param cg_tol Convergence condition (OBS squared Frobenius norm of residual matrix)
   !! \param threshold Threshold for matrix algebra
-  subroutine prg_conjgrad(A_bml, p_bml, p2_bml, d_bml, w_bml, cg_tol, threshold)
+  subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, threshold)
 
     implicit none
 
-    type(bml_matrix_t), intent(in) :: A_bml
-    type(bml_matrix_t), intent(inout) :: p_bml, p2_bml, d_bml, w_bml
+    type(bml_matrix_t), intent(in) :: A_bml, p2_bml
+    type(bml_matrix_t), intent(inout) :: p_bml, tmp_bml, d_bml, w_bml
     real(dp), intent(in) :: cg_tol, threshold
 
     real(dp) :: alpha, beta
     integer :: k
     real(dp) :: r_norm_old, r_norm_new
 
-    call bml_multiply(A_bml, p_bml, p2_bml, -1.0_dp, 1.0_dp, threshold)
-    r_norm_new = bml_sum_squares(p2_bml)
+    call bml_copy(p2_bml,tmp_bml)
+    call bml_multiply(A_bml, p_bml, tmp_bml, -1.0_dp, 1.0_dp, threshold)
+    r_norm_new = bml_sum_squares(tmp_bml)
     k = 0
 
     do while (r_norm_new .gt. cg_tol)
 
-      !   write(*,*) r_norm_new
+      write(*,*) r_norm_new
       k = k + 1
       if (k .eq. 1) then
-        write(*,*) r_norm_new
-        call bml_copy(p2_bml, d_bml)
+        call bml_copy(tmp_bml, d_bml)
       else
         beta = r_norm_new/r_norm_old
-        call bml_add(d_bml, p2_bml, beta, 1.0_dp, threshold)
+        call bml_add(d_bml, tmp_bml, beta, 1.0_dp, threshold)
       endif
 
       call bml_multiply(A_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
       alpha = r_norm_new/bml_trace_mult(d_bml, w_bml)
 
       call bml_add(p_bml, d_bml, 1.0_dp, alpha, threshold)
-      call bml_add(p2_bml, w_bml, 1.0_dp, -alpha, threshold)
+      call bml_add(tmp_bml, w_bml, 1.0_dp, -alpha, threshold)
       r_norm_old = r_norm_new
-      r_norm_new = bml_sum_squares(p2_bml)
-      if (k .gt. 50) then
+      r_norm_new = bml_sum_squares(tmp_bml)
+      if (k .gt. 500) then
         write(*,*) "Conjugate gradient is not converging"
         stop
       endif
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
new file mode 100644
index 00000000..54f4ecf1
--- /dev/null
+++ b/src/prg_xlbokernel_mod.F90
@@ -0,0 +1,837 @@
+!> Pre-conditioned O(N) calculation of the kernel for XL-BOMD.
+!! \ingroup PROGRESS
+!! \brief Here are subroutines implementing Niklasson's scheme for 
+!!  low-rank, Krylov subspace approximation of the kernel. 
+module prg_xlbokernel_mod
+
+  use omp_lib
+  use bml
+  use prg_normalize_mod
+  use prg_densitymatrix_mod
+  use prg_timer_mod
+  use prg_parallel_mod
+  use prg_ewald_mod
+  use prg_implicit_fermi_mod 
+
+  implicit none
+
+  private  !Everything is private by default
+
+  integer, parameter :: dp = kind(1.0d0)
+
+  public :: prg_kernel_multirank
+  public :: prg_kernel_multirank_latte
+  public :: prg_kernel_matrix_multirank
+  public :: prg_full_kernel
+  public :: prg_full_kernel_latte
+
+contains
+
+subroutine Invert(A,AI,N)
+
+implicit none
+integer, parameter                          :: PREC = 8
+integer, intent(in)                         :: N
+real(PREC),                 intent(in)      :: A(N,N)
+real(PREC), intent(out)                     :: AI(N,N)
+real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
+integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
+integer                                     :: I,J,K
+
+external DGETRF
+external DGETRI
+
+AI = A
+LDA = N
+M = N
+LWORK = N+N*N
+
+call DGETRF(M, N, AI, LDA, IPIV, INFO)
+call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
+
+end subroutine Invert
+  
+!> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
+!! \param KRes The low rank approximation 
+!! \param KK0_bml The pre-conditioner K0.
+!! \param Res The residual q[n]-n
+!! \param FelTol Relative error tolerance for approximation 
+!! \param L Number of vectors used. 
+!! \param LMAX Maximum nr of vectors to use. 
+!! \param NUMRANK Nr of vectors to use. 
+!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
+!! \param mu The chemical potiential. 
+!! \param beta Scaled inverse temperature.
+!! \param RXYZ Nuclear coordinates.
+!! \param Box Box dimensions. 
+!! \param Hubbard_U Hubbard U list. 
+!! \param Element_Pointer List to keep track of elements. 
+!! \param Nr_atoms The number of atoms. 
+!! \param HDIM Hamiltonian matrix dimension. 
+!! \param Max_Nr_Neigh Max neighbours for Ewald. 
+!! \param Coulomb_acc Coulomb accuracy.
+!! \param nebcoul Neighbour lists.
+!! \param totnebcoul Number of neighbours list. 
+!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
+!! \param S_bml The S matrix. 
+!! \param Z_bml, The Z matrix. 
+!! \param Nocc Occupation. 
+!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+!! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+!! \param m_rec Number of recursion steps.
+!! \param threshold Threshold value for matrix truncation.
+!! \param Nr_elem Number of elements in Hubbard list. 
+subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, & 
+    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
+    S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+
+!! Res = q[n] - n
+!! KK0 is preconditioner
+!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+  implicit none
+  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+  real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
+  real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
+  real(dp), intent(in)      :: Res(Nr_atoms)
+  integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
+  real(dp), intent(in)      :: Hubbard_U(Nr_elem)
+  type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+  real(dp)                  :: K0Res(Nr_atoms)
+  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+  real(dp), intent(in)      :: threshold
+  integer,    intent(in)      :: LMAX
+  character(20)             :: bml_type
+  integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+  real(dp)                  :: dq_v(Nr_atoms)
+  real(dp), intent(inout)   :: KRes(Nr_atoms)
+  integer                     :: I,J,K,It,N,MN
+  integer, intent(out)        :: L
+  real(dp)                  :: Fel, proj_tmp, start, finish
+  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
+
+  call timer_prg_init()
+  bml_type = bml_get_type(HO_bml)
+  N = bml_get_N(HO_bml)
+  MN = bml_get_M(HO_bml)
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+
+  call bml_transpose(Z_bml,ZT_bml)
+
+  ! K0Res = KK0*Res temporary for matrix-vector multiplication
+  call bml_set_row(Res_bml,1,Res,threshold)
+  call bml_transpose(KK0_bml,KK0T_bml)
+  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+  call bml_get_row(K0Res_bml,1,row_NA)
+  K0Res = row_NA
+  dr = K0Res
+
+  I = 0
+  Fel = 1.D0  
+  do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
+    I = I + 1
+    vi(:,I) = dr/norm2(dr)
+    do J = 1,I-1
+      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+    enddo
+    vi(:,I) = vi(:,I)/norm2(vi(:,I))
+    v(:) = vi(:,I)  ! v_i
+
+    ! Compute H1 = H(v) 
+    dq_v = v 
+    call prg_timer_start(1)
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
+    do J = 1,Nr_atoms
+      call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
+      dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
+      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+    call prg_timer_stop(1,1)
+
+
+    call bml_deallocate(H1_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+    do It = 1,Nr_atoms-1
+       do K = Hinxlist(It)+1,Hinxlist(It+1)
+           row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+    enddo
+     
+    ! H1 = 1/2(S*H1+H1*S)
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(X_bml,H1_bml)
+
+    ! H1 = Z^T H1 Z
+    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+        m_rec, mu, beta, real(nocc,dp), threshold)
+    
+    ! D1 = Z D1 Z^T
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    ! Compute dq/dv
+    call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
+    call bml_get_diagonal(X_bml,row1)
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+    do It = 1, Nr_atoms-1
+       dq_dv(It) = 0
+       do K = Hinxlist(It)+1,Hinxlist(It+1)
+          dq_dv(It) = dq_dv(It) + row1(K)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    dq_dv(Nr_atoms) = 0
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+    enddo
+
+    dr = dq_dv - v
+    ! fi = K0(dq_dv - v)
+    call bml_set_row(Res_bml,1,dr,threshold)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    dr = row_NA
+    fi(:,I) = dr
+
+    L = I
+    allocate(O(L,L), M(L,L))
+    do K = 1,L
+    do J = 1,L
+       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+    enddo
+    enddo
+    call Invert(O,M,L)                        ! M = O^(-1)
+    IdentRes = 0.D0*K0Res
+    KRes = 0.D0
+    do K = 1,L
+    do J = 1,L
+      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+      IdentRes = IdentRes + proj_tmp*fi(:,K)
+      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
+    enddo
+    enddo
+    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+    deallocate(O, M)
+
+  enddo
+
+  deallocate(row1);deallocate(row2);deallocate(row_NA)
+  call bml_deallocate(KK0T_bml)
+  call bml_deallocate(K0Res_bml)
+  call bml_deallocate(Res_bml)
+  call bml_deallocate(ZT_bml)
+  call prg_timer_shutdown() 
+
+  end subroutine prg_kernel_multirank_latte
+
+  ! Above routine but for development code   
+  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
+    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+!! Res = q[n] - n
+!! KK0 is preconditioner
+!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+  implicit none
+  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+  real(dp), intent(in)      :: Res(Nr_atoms)
+  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+  real(dp)                  :: K0Res(Nr_atoms)
+  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+  real(dp), intent(in)      :: threshold
+  integer,    intent(in)      :: LMAX
+  character(10), intent(in)   :: Element_Type(Nr_atoms)
+  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+  real(dp)                  :: dq_v(Nr_atoms)
+  real(dp), intent(inout)   :: KRes(Nr_atoms)
+  integer                     :: I,J,K,It
+  integer, intent(out)        :: L
+  real(dp)                  :: Fel, proj_tmp
+  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
+
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+
+  ! K0Res = KK0*Res temporary for matrix-vector multiplication
+  call bml_transpose(KK0_bml,KK0T_bml) 
+  call bml_set_row(Res_bml,1,Res,ONE*1e-14)
+  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
+  call bml_get_row(K0Res_bml,1,row_NA)
+  K0Res = row_NA
+  dr = K0Res
+
+  I = 0
+  Fel = 1.D0  
+  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+    I = I + 1
+    vi(:,I) = dr/norm2(dr)
+    do J = 1,I-1
+      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+    enddo
+    vi(:,I) = vi(:,I)/norm2(vi(:,I))
+    v(:) = vi(:,I)  ! v_i
+
+    dq_v = v 
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+    do J = 1,Nr_atoms
+      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    row1 = 0.0_dp
+    do J = 1,HDIM
+       call bml_set_row(H1_bml,J,row1,threshold)
+    enddo
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+    do J = 1,Nr_atoms
+       do K = H_INDEX_START(J),H_INDEX_END(J)
+           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(D1_bml,H1_bml)
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+        m_rec, mu, beta, real(nocc,dp), threshold)
+    
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+    do It = 1, Nr_atoms
+      dq_dv(It) = 0
+      do K = H_INDEX_START(It), H_INDEX_END(It)
+        call bml_get_row(S_bml,K,row1)
+        call bml_get_row(D1_bml,K,row2)
+        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    dr = dq_dv - v 
+    ! fi = K0(dq_dv - v)
+    call bml_set_row(Res_bml,1,dr,threshold)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    dr = row_NA
+    fi(:,I) = dr
+
+    L = I
+    allocate(O(L,L), M(L,L))
+    do K = 1,L
+    do J = 1,L
+       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+    enddo
+    enddo
+    call Invert(O,M,L)                        ! M = O^(-1)
+    IdentRes = 0.D0*K0Res
+    KRes = 0.D0
+    do K = 1,L
+    do J = 1,L
+      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+      IdentRes = IdentRes + proj_tmp*fi(:,K)
+      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
+    enddo
+    enddo
+    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+    deallocate(O, M)
+
+  enddo
+
+  deallocate(row1);deallocate(row2);deallocate(row_NA)
+  call bml_deallocate(KK0T_bml)
+  call bml_deallocate(K0Res_bml)
+  call bml_deallocate(Res_bml)
+
+  end subroutine prg_kernel_multirank
+
+!> Compute full inverse Jacobian of q[n]-n (for LATTE)
+!! \param KK The inverse Jacobian.
+!! \param DO_bml Orthogonalized density matrix. 
+!! \param mu0 The chemical potiential. 
+!! \param RXYZ Nuclear coordinates.
+!! \param Box Box dimensions. 
+!! \param Hubbard_U Hubbard U list. 
+!! \param Element_Pointer List to keep track of elements. 
+!! \param Nr_atoms The number of atoms. 
+!! \param HDIM Hamiltonian matrix dimension. 
+!! \param Max_Nr_Neigh Max neighbours for Ewald. 
+!! \param Coulomb_acc Coulomb accuracy
+!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
+!! \param S_bml The S matrix. 
+!! \param Z_bml, The Z matrix. 
+!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
+!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+!! \param Nocc Occupation. 
+!! \param m_rec Number of recursion steps.
+!! \param threshold Threshold value for matrix truncation.
+!! \param beta Scaled inverse temperature.
+!! \param Nr_elem Number of elements in Hubbard list. 
+subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
+Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, & 
+Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+Nocc,m_rec,threshold,beta,Nr_elem)
+
+use bml
+
+implicit none
+integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
+real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
+real(PREC)                     :: v(Nr_atoms)
+real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
+integer,    intent(in)         :: Hinxlist(HDIM)
+real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
+type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
+real(PREC), intent(inout)      :: mu0
+integer,    intent(in)         :: Element_Pointer(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+real(PREC)                     :: dq_v(Nr_atoms)
+real(PREC)                     :: dq_dv(Nr_atoms), err,tol
+real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+integer                        :: I,J,K, ITER, mm,It,N,MN
+real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)     
+type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
+character(20)                  :: bml_type
+
+  bml_type = bml_get_type(HO_bml)
+  N = bml_get_N(HO_bml)
+  MN = bml_get_M(HO_bml)
+  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+  call bml_transpose(Z_bml,ZT_bml)
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+  Coulomb_Pot_dq_v = ZERO
+  Coulomb_Pot_k = ZERO
+  dq_v = ZERO
+  JJ = ZERO
+  KK = ZERO
+  
+  do J = 1,Nr_atoms  
+    dq_v(J) = ONE
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+    do I = 1,Nr_atoms   
+        call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
+        dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
+    Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    call bml_deallocate(H1_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+    do I = 1,Nr_atoms-1
+       do K = Hinxlist(I)+1,Hinxlist(I+1)
+           row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+    enddo
+
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(X_bml,H1_bml)
+
+    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold) 
+
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
+    call bml_get_diagonal(X_bml,row1)
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+    do It = 1, Nr_atoms-1
+       dq_dv(It) = 0
+       do K = Hinxlist(It)+1,Hinxlist(It+1)
+          dq_dv(It) = dq_dv(It) + row1(K)
+       enddo
+       JJ(It,J) = dq_dv(It)
+    enddo
+    !$OMP END PARALLEL DO
+    dq_dv(Nr_atoms) = 0
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+    enddo
+    JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+    dq_v = ZERO
+  enddo
+  
+  do I = 1,Nr_atoms
+     JJ(I,I) = JJ(I,I) - ONE
+  enddo
+  
+  call Invert(JJ,KK,Nr_atoms)
+  
+  deallocate(row1); deallocate(row2); deallocate(JJ)
+  call bml_deallocate(ZT_bml)
+
+end subroutine prg_full_kernel_latte
+
+!> Compute full inverse Jacobian of q[n]-n (for development code) 
+!! \param KK The inverse Jacobian.
+!! \param DO_bml Orthogonalized density matrix. 
+!! \param mu0 The chemical potiential. 
+!! \param RX,RY,RZ Nuclear coordinates.
+!! \param Lbox Box dimensions. 
+!! \param Hubbard_U Hubbard U list. 
+!! \param Element_Type List to keep track of elements. 
+!! \param Nr_atoms The number of atoms. 
+!! \param HDIM Hamiltonian matrix dimension. 
+!! \param Max_Nr_Neigh Max neighbours for Ewald. 
+!! \param Coulomb_acc Coulomb accuracy
+!! \param TIMERATIO Parameter for Ewald
+!! \param nnRx,nnRy,nnRz Neighbour lists.
+!! \param nrnnlist Number of neighbours list.
+!! \param nnType Refers to original order of atoms. 
+!! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian. 
+!! \param S_bml The S matrix. 
+!! \param Z_bml, The Z matrix. 
+!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
+!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+!! \param Nocc Occupation. 
+!! \param Znuc List of nuclear charges.
+!! \param m_rec Number of recursion steps.
+!! \param threshold Threshold value for matrix truncation.
+!! \param beta Scaled inverse temperature.
+!! \param diagonal Auxillary vector. 
+
+subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, & 
+nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+Nocc,Znuc,m_rec,threshold,beta,diagonal)
+
+use bml
+
+implicit none
+integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
+real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
+real(PREC)                     :: v(Nr_atoms)
+real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
+real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
+character(10), intent(in)      :: Element_Type(Nr_atoms)
+integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+real(PREC)                     :: dq_v(Nr_atoms)
+real(PREC)                     :: dq_dv(Nr_atoms) 
+real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+integer                        :: I,J,K, ITER, mm,It
+real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)        
+
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+  Coulomb_Pot_dq_v = ZERO
+  Coulomb_Pot_k = ZERO
+  dq_v = ZERO
+  JJ = ZERO
+  KK = ZERO
+  
+  do J = 1,Nr_atoms  
+    dq_v(J) = ONE
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+    do I = 1,Nr_atoms   
+        call Ewald_Real_Space_Single(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,I,RX,RY,RZ,LBox, &
+        dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+    TIMERATIO,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    diagonal = 0.0_dp
+    do I = 1,HDIM
+       call bml_set_row(H1_bml,I,diagonal,threshold)
+    enddo
+    
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+    do I = 1,Nr_atoms
+       do K = H_INDEX_START(I),H_INDEX_END(I)
+           diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call bml_set_diagonal(H1_bml,diagonal,threshold)
+    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(D1_bml,H1_bml)
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold) 
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    do It = 1, Nr_atoms
+       dq_dv(It) = 0
+       do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+       enddo
+       JJ(It,J) = dq_dv(It)
+    enddo
+    dq_v = ZERO
+  enddo
+
+  do I = 1,Nr_atoms
+     JJ(I,I) = JJ(I,I) - ONE
+  enddo
+  call Invert(JJ,KK,Nr_atoms)
+  deallocate(row1); deallocate(row2); deallocate(JJ)
+
+end subroutine prg_full_kernel
+
+
+! Compute the low-rank kernel matrix. (For development code) 
+  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
+    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+!! Res = q[n] - n
+!! KK0 is preconditioner
+!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+  implicit none
+  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+  real(dp), intent(in)      :: Res(Nr_atoms)
+  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
+  real(dp)                  :: K0Res(Nr_atoms)
+  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+  type(bml_matrix_t),intent(inout) :: KK0_bml
+  real(dp), intent(in)      :: threshold
+  integer,    intent(in)      :: LMAX
+  character(10), intent(in)   :: Element_Type(Nr_atoms)
+  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+  real(dp)                  :: dq_v(Nr_atoms)
+  real(dp), intent(out)     :: KRes(Nr_atoms)
+  integer                     :: I,J,K,It,col,row
+  integer, intent(out)        :: L
+  real(dp)                  :: Fel, proj_tmp,elem
+  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
+
+  allocate(row1(HDIM));allocate(row2(HDIM));
+
+  dr = Res
+  I = 0
+  Fel = 1.D0  
+  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+    I = I + 1
+    vi(:,I) = dr/norm2(dr)
+    do J = 1,I-1
+      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+    enddo
+    vi(:,I) = vi(:,I)/norm2(vi(:,I))
+    v(:) = vi(:,I)  ! v_i
+
+    dq_v = v 
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+    do J = 1,Nr_atoms
+      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    row1 = 0.0_dp
+    do J = 1,HDIM
+       call bml_set_row(H1_bml,J,row1,threshold)
+    enddo
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+    do J = 1,Nr_atoms
+       do K = H_INDEX_START(J),H_INDEX_END(J)
+           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(D1_bml,H1_bml)
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+        m_rec, mu, beta, real(nocc,dp), threshold)
+    
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+    do It = 1, Nr_atoms
+      dq_dv(It) = 0
+      do K = H_INDEX_START(It), H_INDEX_END(It)
+        call bml_get_row(S_bml,K,row1)
+        call bml_get_row(D1_bml,K,row2)
+        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    dr = dq_dv - v 
+    fi(:,I) = dr
+
+    L = I
+    allocate(O(L,L), M(L,L))
+    do K = 1,L
+    do J = 1,L
+       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+    enddo
+    enddo
+    call Invert(O,M,L)                        ! M = O^(-1)
+    IdentRes = 0.D0*Res
+    KRes = 0.D0
+    do K = 1,L
+    do J = 1,L
+      proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
+      IdentRes = IdentRes + proj_tmp*fi(:,K)
+      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n) 
+    enddo
+    enddo
+    Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+  
+    ! Does not work, need to normalize matrix  
+        if ((Fel > FelTol).AND.(I < (LMAX))) then 
+          deallocate(O, M)
+        else 
+         do row = 1,Nr_atoms
+         do col = 1,Nr_atoms 
+           elem = 0.0
+           do K = 1,L
+             do J = 1,L 
+               elem = elem + M(J,K)*vi(row,J)*fi(col,K)
+             enddo
+           enddo 
+           if (abs(elem) > threshold) then
+             call bml_set_element(KK0_bml,row,col,elem)
+           endif
+         enddo
+         enddo  
+         deallocate(O,M)
+         endif
+    enddo 
+    deallocate(row1); deallocate(row2);
+
+  end subroutine prg_kernel_matrix_multirank
+
+end module prg_xlbokernel_mod

From 7960bcfbbd5f25863131277e80ddc4b92abe3d02 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Wed, 12 May 2021 19:34:44 +0200
Subject: [PATCH 02/10] O(N) pre-conditioned kernel matrix with implicit
 FD-expansion response

---
 src/CMakeLists.txt             |    1 +
 src/prg_ewald_mod.F90          | 1386 ++++++++++++++---------------
 src/prg_implicit_fermi_mod.F90 |  210 ++---
 src/prg_timer_mod.F90          |  186 ++--
 src/prg_xlbokernel_mod.F90     | 1505 ++++++++++++++++----------------
 tests/CMakeLists.txt           |    1 +
 tests/src/main.F90             | 1474 ++++++++++++++++---------------
 7 files changed, 2403 insertions(+), 2360 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index beaa317a..becd6040 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -125,6 +125,7 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/prg_timer_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_xlbo_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_xlkernel_mod.mod
+  ${CMAKE_CURRENT_BINARY_DIR}/prg_xlbokernel_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/slaterkosterforce_latte_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/tbparams_latte_mod.mod
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
index 5a032412..95127797 100644
--- a/src/prg_ewald_mod.F90
+++ b/src/prg_ewald_mod.F90
@@ -1,4 +1,4 @@
-! Ewald sum routines for kernel calculation 
+! Ewald sum routines for kernel calculation
 module prg_ewald_mod
 
   use bml
@@ -20,550 +20,550 @@ module prg_ewald_mod
 
 contains
 
-!> Find Coulomb potential on site I from single charge at site J          
-subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
-    DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
-real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
-real(PREC), intent(in)    :: U(Nr_elem)
-real(PREC)                :: COULCUT, COULCUT2
-real(PREC), intent(out)   :: COULOMBV 
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-
-integer                   :: K, ccnt,l,m,n
-
-COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-COULOMBV = ZERO
-
-TI = TFACT*U(Element_Pointer(I))
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RXYZ(1,I)
-Ra(2) = RXYZ(2,I)
-Ra(3) = RXYZ(3,I)
+  !> Find Coulomb potential on site I from single charge at site J
+  subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
+       DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
+    real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+    integer                   :: K, ccnt,l,m,n
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
 
     do k = -1,1
-    do m = -1,1
-    do l = -1,1
-
-    Rb(1) = RXYZ(1,J)+k*box(1,1)
-    Rb(2) = RXYZ(2,J)+m*box(2,2)
-    Rb(3) = RXYZ(3,J)+l*box(3,3)
-    Rab = Rb-Ra  ! OBS b - a !!!
-    dR = norm2(Rab)
-    MAGR = dR
-    MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(Element_Pointer(J))
-       DC = Rab/dR
-
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Pointer(I).eq.Element_Pointer(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-       endif
-    endif
-    enddo
-    enddo
+      do m = -1,1
+        do l = -1,1
+
+          Rb(1) = RXYZ(1,J)+k*box(1,1)
+          Rb(2) = RXYZ(2,J)+m*box(2,2)
+          Rb(3) = RXYZ(3,J)+l*box(3,3)
+          Rab = Rb-Ra  ! OBS b - a !!!
+          dR = norm2(Rab)
+          MAGR = dR
+          MAGR2 = dR*dR
+
+          if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+            TJ = TFACT*U(Element_Pointer(J))
+            DC = Rab/dR
+
+            Z = abs(CALPHA*MAGR)
+            NUMREP_ERFC = erfc(Z)
+
+            CA = NUMREP_ERFC/MAGR
+            COULOMBV = COULOMBV + DELTAQ(J)*CA
+            ccnt = ccnt + 1
+            CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+            EXPTI = exp(-TI*MAGR )
+
+            if (Element_Pointer(I).eq.Element_Pointer(J)) then
+              COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+              ccnt = ccnt + 1
+            else
+              TJ2 = TJ*TJ
+              TJ3 = TJ2*TJ
+              TJ4 = TJ2*TJ2
+              TJ6 = TJ4*TJ2
+              EXPTJ = exp( -TJ*MAGR )
+              TI2MTJ2 = TI2 - TJ2
+              TJ2MTI2 = -TI2MTJ2
+              SA = TI
+              SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+              SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+              SD = TJ
+              SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+              SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+              COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+            endif
+          endif
+        enddo
+      enddo
     enddo
 
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space_Single_latte
-
-subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
-    DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
-
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
-real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-real(PREC), intent(in)    :: U(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-character(10), intent(in) :: Element_Type(Nr_atoms)
-real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-
-integer                   :: K, ccnt,l,m,n
-
-COULVOL = LBox(1)*LBox(2)*LBox(3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-FCOUL = ZERO
-COULOMBV = ZERO
-
-TI = TFACT*U(I)
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RX(I)
-Ra(2) = RY(I)
-Ra(3) = RZ(I)
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Single_latte
+
+  subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+       DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
+
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
+    real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+    integer                   :: K, ccnt,l,m,n
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
 
     do k = -1,1
-    do m = -1,1
-    do l = -1,1
-
-    Rb(1) = RX(J)+k*Lbox(1)
-    Rb(2) = RY(J)+m*Lbox(2)
-    Rb(3) = RZ(J)+l*Lbox(3)
-    Rab = Rb-Ra  ! OBS b - a !!!
-    dR = norm2(Rab)
-    MAGR = dR
-    MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(J)
-       DC = Rab/dR
-
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Type(I).eq.Element_Type(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
-             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
-              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
-       endif
-
-       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
-       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
-       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
-    endif
-    enddo
+      do m = -1,1
+        do l = -1,1
+
+          Rb(1) = RX(J)+k*Lbox(1)
+          Rb(2) = RY(J)+m*Lbox(2)
+          Rb(3) = RZ(J)+l*Lbox(3)
+          Rab = Rb-Ra  ! OBS b - a !!!
+          dR = norm2(Rab)
+          MAGR = dR
+          MAGR2 = dR*dR
+
+          if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+            TJ = TFACT*U(J)
+            DC = Rab/dR
+
+            Z = abs(CALPHA*MAGR)
+            NUMREP_ERFC = erfc(Z)
+
+            CA = NUMREP_ERFC/MAGR
+            COULOMBV = COULOMBV + DELTAQ(J)*CA
+            ccnt = ccnt + 1
+            CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+            FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+            EXPTI = exp(-TI*MAGR )
+
+            if (Element_Type(I).eq.Element_Type(J)) then
+              COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+              ccnt = ccnt + 1
+              FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+                   + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+            else
+              TJ2 = TJ*TJ
+              TJ3 = TJ2*TJ
+              TJ4 = TJ2*TJ2
+              TJ6 = TJ4*TJ2
+              EXPTJ = exp( -TJ*MAGR )
+              TI2MTJ2 = TI2 - TJ2
+              TJ2MTI2 = -TI2MTJ2
+              SA = TI
+              SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+              SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+              SD = TJ
+              SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+              SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+              COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+              FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+                   + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+            endif
+
+            FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+            FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+            FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+          endif
+        enddo
+      enddo
     enddo
+
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Single
+
+  subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
+       DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    integer, intent(in)       :: Element_Pointer(Nr_atoms)
+    integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
+
+    do newj = 1,totnebcoul(I)
+      J = NEBCOUL(1, NEWJ, I)
+      PBCI = NEBCOUL(2, NEWJ, I)
+      PBCJ = NEBCOUL(3, NEWJ, I)
+      PBCK = NEBCOUL(4, NEWJ, I)
+      Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+           REAL(PBCK)*BOX(3,1)
+
+      Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+           REAL(PBCK)*BOX(3,2)
+
+      Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+           REAL(PBCK)*BOX(3,3)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(Element_Pointer(J))
+        DC = Rab/dR
+
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Pointer(I).eq.Element_Pointer(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
     enddo
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_latte
+
+  subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+       DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
+    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, nnI
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+    do nnI = 1,nrnnlist(I)
+      Rb(1) = nnRx(I,nnI)
+      Rb(2) = nnRy(I,nnI)
+      Rb(3) = nnRz(I,nnI)
+      J = nnType(I,nnI)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(J)
+        DC = Rab/dR
+
+        ! Not Using Numerical Recipes ERFC
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        !TEST(ccnt) = DELTAQ(J)*CA
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Type(I).eq.Element_Type(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+          !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+               + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+          FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+               + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+        endif
 
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space_Single
-
-subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
-    DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
-real(PREC), intent(in)    :: COULACC
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
-real(PREC), intent(in)    :: U(Nr_elem)
-real(PREC)                :: COULCUT, COULCUT2
-integer, intent(in)       :: Element_Pointer(Nr_atoms)
-integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
-real(PREC), intent(out)   :: COULOMBV
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
-
-COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-COULOMBV = ZERO
-
-TI = TFACT*U(Element_Pointer(I))
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RXYZ(1,I)
-Ra(2) = RXYZ(2,I)
-Ra(3) = RXYZ(3,I)
-
-do newj = 1,totnebcoul(I)
-        J = NEBCOUL(1, NEWJ, I)
-        PBCI = NEBCOUL(2, NEWJ, I)
-        PBCJ = NEBCOUL(3, NEWJ, I)
-        PBCK = NEBCOUL(4, NEWJ, I)
-        Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
-             REAL(PBCK)*BOX(3,1) 
-
-        Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
-             REAL(PBCK)*BOX(3,2) 
-
-        Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
-             REAL(PBCK)*BOX(3,3) 
-        Rab = Rb-Ra  ! OBS b - a !!!
-        dR = norm2(Rab)
-        MAGR = dR
-        MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(Element_Pointer(J))
-       DC = Rab/dR
-
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Pointer(I).eq.Element_Pointer(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-       endif
-
-    endif
-enddo
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space_latte
-
-subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
-    DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
-real(PREC), intent(in)    :: COULACC, TIMERATIO
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
-real(PREC), intent(in)    :: U(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-character(10), intent(in) :: Element_Type(Nr_atoms)
-integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-integer                   :: J,K, ccnt, nnI
-
-COULVOL = LBox(1)*LBox(2)*LBox(3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-FCOUL = ZERO
-COULOMBV = ZERO
-
-TI = TFACT*U(I)
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RX(I)
-Ra(2) = RY(I)
-Ra(3) = RZ(I)
-
-do nnI = 1,nrnnlist(I)
-  Rb(1) = nnRx(I,nnI)
-  Rb(2) = nnRy(I,nnI)
-  Rb(3) = nnRz(I,nnI)
-  J = nnType(I,nnI)
-    Rab = Rb-Ra  ! OBS b - a !!!
-    dR = norm2(Rab)
-    MAGR = dR
-    MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(J)
-       DC = Rab/dR
-
-       ! Not Using Numerical Recipes ERFC
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       !TEST(ccnt) = DELTAQ(J)*CA
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Type(I).eq.Element_Type(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-           !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
-             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
-              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
-       endif
-
-       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
-       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
-       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
-    endif
-enddo
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space
-
-subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
-
-implicit none
-
-integer, parameter        :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
-real(PREC), intent(in)    :: COULACC
-real(PREC)                :: KECONST, TFACT, RELPERM
-real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
-real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
-real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
-real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
-
-integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-
-COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-
-COULCUT = 12.0D0
-CALPHA = SQRTX/COULCUT
-
-COULCUT2 = COULCUT*COULCUT
-KCUTOFF = TWO*CALPHA*SQRTX
-KCUTOFF2 = KCUTOFF*KCUTOFF
-CALPHA2 = CALPHA*CALPHA
-FOURCALPHA2 = FOUR*CALPHA2
-
-RECIPVECS = ZERO
-RECIPVECS(1,1) = TWO*pi/Box(1,1)
-RECIPVECS(2,2) = TWO*pi/Box(2,2)
-RECIPVECS(3,3) = TWO*pi/Box(3,3)
-LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
-MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
-NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
-
-RELPERM = 1.D0
-KECONST = 14.3996437701414D0*RELPERM
-
-COULOMBV = ZERO
-SINLIST = ZERO
-COSLIST = ZERO
-
-do L = 0,LMAX
-
-  if (L.eq.0) then
-    MMIN = 0
-  else
-    MMIN = -MMAX
-  endif
-
-  L11 = L*RECIPVECS(1,1)
-  L12 = L*RECIPVECS(1,2)
-  L13 = L*RECIPVECS(1,3)
-
-  do M = MMIN,MMAX
+        FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+        FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+        FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+      endif
+    enddo
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space
+
+  subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
 
         NMIN = -NMAX
         if ((L==0).and.(M==0)) then
@@ -575,122 +575,122 @@ subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_
         M23 = L13 + M*RECIPVECS(2,3)
 
         do N = NMIN,NMAX
-           K(1) = M21 + N*RECIPVECS(3,1)
-           K(2) = M22 + N*RECIPVECS(3,2)
-           K(3) = M23 + N*RECIPVECS(3,3)
-           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
-           if (K2.le.KCUTOFF2) then
-              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
-              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
-
-              COSSUM = 0.D0
-              SINSUM = 0.D0
-
-              ! Doing the sin and cos sums
-              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
-              !$OMP REDUCTION(+:COSSUM) & 
-              !$OMP REDUCTION(+:SINSUM)  
-              do I = 1,Nr_atoms
-                 DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
-                 ! We re-use these in the next loop...
-                 SINLIST(I) = sin(DOT)
-                 COSLIST(I) = cos(DOT)
-                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
-                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
-              enddo
-              !$OMP END PARALLEL DO
-              COSSUM2 = COSSUM*COSSUM
-              SINSUM2 = SINSUM*SINSUM
-
-              ! Add up energy and force contributions
-
-              KEPREF = KECONST*PREFACTOR
-              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I) 
-              do I = 1,Nr_atoms
-                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
-              enddo
-              !$OMP END PARALLEL DO
-
-              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
-           endif
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+            COSSUM = 0.D0
+            SINSUM = 0.D0
+
+            ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+            !$OMP REDUCTION(+:COSSUM) &
+            !$OMP REDUCTION(+:SINSUM)
+            do I = 1,Nr_atoms
+              DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+              COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+              SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            !$OMP END PARALLEL DO
+            COSSUM2 = COSSUM*COSSUM
+            SINSUM2 = SINSUM*SINSUM
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
+            do I = 1,Nr_atoms
+              COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+            enddo
+            !$OMP END PARALLEL DO
+
+            KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
         enddo
-  enddo
-enddo
-
-! Point self energy
-CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
-COULOMBV = COULOMBV - CORRFACT*DELTAQ;
-
-end subroutine Ewald_k_Space_latte
-
-subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
-
-implicit none
-
-integer, parameter        :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
-real(PREC), intent(in)    :: COULACC, TIMERATIO
-real(PREC)                :: KECONST, TFACT, RELPERM
-real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
-real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
-real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
-real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
-
-integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-
-COULVOL = LBox(1)*LBox(2)*LBox(3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-
-COULCUT = 12.0D0
-CALPHA = SQRTX/COULCUT
-
-COULCUT2 = COULCUT*COULCUT
-KCUTOFF = TWO*CALPHA*SQRTX
-KCUTOFF2 = KCUTOFF*KCUTOFF
-CALPHA2 = CALPHA*CALPHA
-FOURCALPHA2 = FOUR*CALPHA2
-
-RECIPVECS = ZERO
-RECIPVECS(1,1) = TWO*pi/LBox(1)
-RECIPVECS(2,2) = TWO*pi/LBox(2)
-RECIPVECS(3,3) = TWO*pi/LBox(3)
-LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
-MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
-NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
-
-RELPERM = 1.D0
-KECONST = 14.3996437701414D0*RELPERM
-
-FCOUL = ZERO
-COULOMBV = ZERO
-SINLIST = ZERO
-COSLIST = ZERO
-
-do L = 0,LMAX
-
-  if (L.eq.0) then
-    MMIN = 0
-  else
-    MMIN = -MMAX
-  endif
-
-  L11 = L*RECIPVECS(1,1)
-  L12 = L*RECIPVECS(1,2)
-  L13 = L*RECIPVECS(1,3)
-
-  do M = MMIN,MMAX
+      enddo
+    enddo
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_latte
+
+  subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/LBox(1)
+    RECIPVECS(2,2) = TWO*pi/LBox(2)
+    RECIPVECS(3,3) = TWO*pi/LBox(3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
 
         NMIN = -NMAX
         if ((L==0).and.(M==0)) then
@@ -702,50 +702,50 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
         M23 = L13 + M*RECIPVECS(2,3)
 
         do N = NMIN,NMAX
-           K(1) = M21 + N*RECIPVECS(3,1)
-           K(2) = M22 + N*RECIPVECS(3,2)
-           K(3) = M23 + N*RECIPVECS(3,3)
-           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
-           if (K2.le.KCUTOFF2) then
-              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
-              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
-
-              COSSUM = 0.D0
-              SINSUM = 0.D0
-
-              ! Doing the sin and cos sums
-              do I = 1,Nr_atoms
-                 DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
-                 ! We re-use these in the next loop...
-                 SINLIST(I) = sin(DOT)
-                 COSLIST(I) = cos(DOT)
-                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
-                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
-              enddo
-              COSSUM2 = COSSUM*COSSUM
-              SINSUM2 = SINSUM*SINSUM
-
-              ! Add up energy and force contributions
-
-              KEPREF = KECONST*PREFACTOR
-              do I = 1,Nr_atoms
-                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
-                 FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
-                 FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
-                 FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
-                 FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
-              enddo
-
-              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
-           endif
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+            COSSUM = 0.D0
+            SINSUM = 0.D0
+
+            ! Doing the sin and cos sums
+            do I = 1,Nr_atoms
+              DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+              COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+              SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            COSSUM2 = COSSUM*COSSUM
+            SINSUM2 = SINSUM*SINSUM
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            do I = 1,Nr_atoms
+              COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+              FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
+              FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
+              FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
+              FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
+            enddo
+
+            KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
         enddo
-  enddo
-enddo
+      enddo
+    enddo
 
-! Point self energy
-CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
-COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
 
-end subroutine Ewald_k_Space
+  end subroutine Ewald_k_Space
 
 end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 2e217a98..76bd2bc2 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -30,7 +30,7 @@ module prg_implicit_fermi_mod
 contains
 
   !> Recursive Implicit Fermi Dirac for finite temperature.
-  !! \param Inv_bml Inverses generated by algorithm.  
+  !! \param Inv_bml Inverses generated by algorithm.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
   !! \param nsteps Number of recursion steps.
@@ -40,8 +40,8 @@ module prg_implicit_fermi_mod
   !! \param occErrLimit Occupation error limit.
   !! \param threshold Threshold for multiplication.
   !! \param tol Tolerance for linear system solver.
-  !! \param SCF_IT The current SCF iteration. 
-  !! \param occiter Counts the total nr of DM calculations during MD. 
+  !! \param SCF_IT The current SCF iteration.
+  !! \param occiter Counts the total nr of DM calculations during MD.
   !! See \cite{niklasson2003}
   subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc, &
        mu, beta, occErrLimit, threshold, tol,SCF_IT, occiter)
@@ -58,8 +58,8 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     integer, intent(inout) :: occiter
 
     type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
-    real(dp) :: trdPdmu, trP0, occErr, alpha 
-    real(dp) :: cnst, ofactor, mustep 
+    real(dp) :: trdPdmu, trP0, occErr, alpha
+    real(dp) :: cnst, ofactor, mustep
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
     integer :: N, M, i, iter, muadj, prev
@@ -75,7 +75,7 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
     call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
-    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml) 
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
 
     occErr = 10.0_dp
     alpha = 1.0_dp
@@ -97,10 +97,10 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
     else
       ! Otherwise use previous inverse as starting guess
-      call bml_copy(Inv_bml(1),ai_bml) 
+      call bml_copy(Inv_bml(1),ai_bml)
     end if
 
-    do while (occErr .gt. occErrLimit .or. muadj .eq. 1) 
+    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < 50)
       iter = iter + 1
       muadj = 0
       write(*,*) 'mu =', mu
@@ -109,18 +109,18 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call bml_copy(h_bml, p_bml)
       call prg_normalize_implicit_fermi(p_bml, cnst, mu)
 
-        do i = 1, nsteps
-            call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
-            ! Y = 2*(P2-P) + I
-            call bml_copy(p2_bml, y_bml)
-            call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
-            call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
-            ! Find inverse ai = (2*(P2-P)+I)^-1
-            !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
-            call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
-            call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
-            call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation  
-        enddo
+      do i = 1, nsteps
+        call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+        ! Y = 2*(P2-P) + I
+        call bml_copy(p2_bml, y_bml)
+        call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+        call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+        ! Find inverse ai = (2*(P2-P)+I)^-1
+        !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
+        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+        call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+        call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
+      enddo
 
       trdPdmu = bml_trace(p_bml)
       trP0 = trdPdmu
@@ -129,40 +129,44 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       occErr = abs(trP0 - nocc)
       write(*,*) 'occerr =', nocc-trP0
 
-      ! If occupation error is too large, do bisection method 
-      if (occerr > 10.0_dp) then 
-         if (nocc-trP0 < 0.0_dp) then
-            if (prev .eq. 1) then
-               alpha = alpha/2
-            endif 
-            prev = -1
-            mu = mu - alpha
-         else 
-            if (prev .eq. -1) then
-               alpha = alpha/2
-            endif
-            prev = 1
-            mu = mu + alpha 
-         endif  
-      ! Otherwise do Newton 
+      ! If occupation error is too large, do bisection method
+      if (occerr > 10.0_dp) then
+        if (nocc-trP0 < 0.0_dp) then
+          if (prev .eq. 1) then
+            alpha = alpha/2
+          endif
+          prev = -1
+          mu = mu - alpha
+        else
+          if (prev .eq. -1) then
+            alpha = alpha/2
+          endif
+          prev = 1
+          mu = mu + alpha
+        endif
+        ! Otherwise do Newton
       else if (occErr .gt. occErrLimit) then
         mustep = (nocc -trP0)/trdPdmu
         if (abs(mustep) > 1.0) then
           mustep = 0.1_dp*mustep
-        end if  
-        mu = mu + mustep 
+        end if
+        mu = mu + mustep
         muadj = 1
       end if
     enddo
-    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge. 
-    ! For now we recompute the DM one extra time if mu was adjusted. 
+
+    if (iter .ge. 50) then 
+            write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
+    end if 
+    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge.
+    ! For now we recompute the DM one extra time if mu was adjusted.
     !if (muadj .eq. 1) then
-      ! Adjust occupation
-     ! call bml_copy(p_bml, d_bml)
-     ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
-     ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
-     ! ofactor = ((nocc - trP0)/trdPdmu) * beta
-     ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    ! Adjust occupation
+    ! call bml_copy(p_bml, d_bml)
+    ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
+    ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
+    ! ofactor = ((nocc - trP0)/trdPdmu) * beta
+    ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
     !end if
     occiter = occiter + iter
     call bml_scale(2.0_dp,p_bml)
@@ -177,7 +181,7 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     call bml_deallocate(I_bml)
 
   end subroutine prg_implicit_fermi_save_inverse
-  
+
   !> Recursive Implicit Fermi Dirac for finite temperature.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
@@ -216,17 +220,12 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     N = bml_get_N(h_bml)
     M = bml_get_M(h_bml)
 
-    call bml_print_matrix("h_bml",h_bml,1,10,1,10)
-    call bml_print_matrix("p_bml",p_bml,1,10,1,10)
-    write(*,*) nsteps, k, nocc, &
-       mu, beta, method, osteps, occErrLimit, threshold, tol
-    !stop
     allocate(trace(2))
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
-    if (k .gt. 2) then
+    if (k .ge. 2) then
       call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux1_bml)
       call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux2_bml)
     endif
@@ -292,8 +291,10 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
       trP0 = trdPdmu
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
-      mu = mu + (nocc - trP0)/trdPdmu
       occErr = abs(trP0 - nocc)
+      if (occErr .gt. occErrLimit) then 
+        mu = mu + (nocc - trP0)/trdPdmu
+      end if 
       write(*,*) "mu =", mu
     enddo
 
@@ -304,7 +305,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
 
     call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
     ofactor = ((nocc - trP0)/trdPdmu) * beta
-    call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    !call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
     !call bml_print_matrix("P adjusted occupation",p_bml,0,10,0,10)
 
     deallocate(trace)
@@ -313,7 +314,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     call bml_deallocate(w_bml)
     call bml_deallocate(d_bml)
     call bml_deallocate(y_bml)
-    if (k .gt. 2) then
+    if (k .ge. 2) then
       call bml_deallocate(aux1_bml)
       call bml_deallocate(aux2_bml)
     endif
@@ -438,7 +439,7 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     integer, intent(in) :: nsteps
     type(bml_matrix_t) :: B_bml, C_bml, C0_bml
     character(20) :: bml_type
-    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst 
+    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst
     integer :: N, M, i, j, k
 
     bml_type = bml_get_type(H0_bml)
@@ -451,46 +452,46 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
 
     cnst = beta/(2**(2+nsteps))
 
-      ! P0 = 0.5*II - cnst*(H0-mu0*II)
-      call bml_copy(H0_bml, P0_bml)
-      call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
-
-      ! P1 =  - cnst*H1
-      call bml_copy(H1_bml, P1_bml)
-      call bml_scale(-1.0_dp*cnst, P1_bml)
-      do i = 1, nsteps
-
-        ! Calculate coefficient matrices
-        ! C0 = P0^2
-        call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
-        ! C = P0*P1+P1*P0, B = 2(P1 - C)
-        call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-        call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
-        call bml_copy(P1_bml, B_bml)
-        call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
-        ! Get next P0
-        call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
-        ! Get next P1
-        ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
-        call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
-        call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
-      enddo
+    ! P0 = 0.5*II - cnst*(H0-mu0*II)
+    call bml_copy(H0_bml, P0_bml)
+    call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
+
+    ! P1 =  - cnst*H1
+    call bml_copy(H1_bml, P1_bml)
+    call bml_scale(-1.0_dp*cnst, P1_bml)
+    do i = 1, nsteps
+
+      ! Calculate coefficient matrices
+      ! C0 = P0^2
+      call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+      ! C = P0*P1+P1*P0, B = 2(P1 - C)
+      call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+      call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+      call bml_copy(P1_bml, B_bml)
+      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+      ! Get next P0
+      call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
+      ! Get next P1
+      ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
+      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+    enddo
 
-      ! dPdmu = beta*P0(I-P0)
-      call bml_copy(P0_bml, B_bml)
-      call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
-      call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-      call bml_scale(beta,C_bml)
-      dPdmu_trace = bml_trace(C_bml)
-      p1_trace = bml_trace(P1_bml)
-      mu1 =  - p1_trace/dPdmu_trace
-      if (abs(dPdmu_trace) > 1e-8) then
-        call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
-      endif 
+    ! dPdmu = beta*P0(I-P0)
+    call bml_copy(P0_bml, B_bml)
+    call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
+    call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+    call bml_scale(beta,C_bml)
+    dPdmu_trace = bml_trace(C_bml)
+    p1_trace = bml_trace(P1_bml)
+    mu1 =  - p1_trace/dPdmu_trace
+    if (abs(dPdmu_trace) > 1e-8) then
+      call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
+    endif
 
-      call bml_deallocate(B_bml)
-      call bml_deallocate(C_bml)
-      call bml_deallocate(C0_bml)
+    call bml_deallocate(B_bml)
+    call bml_deallocate(C_bml)
+    call bml_deallocate(C0_bml)
 
 
   end subroutine prg_implicit_fermi_first_order_response
@@ -513,7 +514,7 @@ end subroutine prg_implicit_fermi_first_order_response
   !! See \cite{niklasson2015}
   subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P1_bml, P2_bml, P3_bml, &
        nsteps, mu0, mu, beta, nocc, occ_tol, lin_tol, order, threshold)
-  
+
     implicit none
 
     type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, H2_bml, H3_bml
@@ -868,7 +869,8 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
     N2 = N*N
     scaled_tol = tol*N
     do while(err > scaled_tol)
-      !write(*,*) 'iter = ', i 
+      !write(*,*) 'iter = ', i
+      !write(*,*) 'ns error =', err
       call bml_copy(ai_bml, tmp_bml)
       call bml_multiply(a_bml, ai_bml, r_bml, 1.0_dp, 0.0_dp, threshold)
       call bml_scale_add_identity(r_bml, -1.0_dp, 1.0_dp, threshold)
@@ -876,12 +878,12 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
       err = bml_fnorm(r_bml)
       !write(*,*) "err = ", err
       !write(*,*) "prev_err = ", prev_err
-      if (10*prev_err < err) then 
+      if (10*prev_err < err) then
         write(*,*) 'NS did not converge, calling conjugate gradient'
         call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.0001_dp, threshold)
-      else 
+      else
         call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
-      endif 
+      endif
       i = i + 1
     enddo
     !write(*,*) "Number of NS iterations:", i
@@ -1004,7 +1006,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
       call bml_add(tmp_bml, w_bml, 1.0_dp, -alpha, threshold)
       r_norm_old = r_norm_new
       r_norm_new = bml_sum_squares(tmp_bml)
-      if (k .gt. 500) then
+      if (k .gt. 50) then
         write(*,*) "Conjugate gradient is not converging"
         stop
       endif
@@ -1124,8 +1126,10 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
       trP0 = trdPdmu
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
-      mu = mu + (nocc - trP0)/trdPdmu
       occErr = abs(trP0 - nocc)
+      if (occErr .gt. occErrLimit) then
+         mu = mu + (nocc - trP0)/trdPdmu
+      end if 
       !write(*,*) "mu = ", mu
     enddo
 
@@ -1136,7 +1140,7 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
 
     call bml_multiply(p_bml, aux_bml, aux1_bml, 1.0_dp, 0.0_dp, threshold)
     ofactor = ((nocc - trP0)/trdPdmu) * beta
-    call bml_add(p_bml, aux1_bml, 1.0_dp, ofactor, threshold)
+    !call bml_add(p_bml, aux1_bml, 1.0_dp, ofactor, threshold)
     !call bml_print_matrix("Diagonalization - Adjusted occupation",p_bml,0,10,0,10)
 
     call bml_deallocate(eigenvectors_bml)
diff --git a/src/prg_timer_mod.F90 b/src/prg_timer_mod.F90
index 204a90ca..c480b0d0 100644
--- a/src/prg_timer_mod.F90
+++ b/src/prg_timer_mod.F90
@@ -1,17 +1,17 @@
 !> The timer module.
 !! \ingroup PROGRESS
 !!
-!! \brief Sets up timers that can be used to time other routines. 
+!! \brief Sets up timers that can be used to time other routines.
 !!
-!! Example use of dynamic timing: 
+!! Example use of dynamic timing:
 !!
 !!    call timer_prg_init()
-!!     
+!!
 !!     call prg_timer_start(dyn_timer,"timer_tag")
 !!
-!!     .... code lines ... 
+!!     .... code lines ...
 !!
-!!     call prg_timer_stop(dyn_timer,1)  
+!!     call prg_timer_stop(dyn_timer,1)
 !!
 !!
 !! This will write the time it takes to execute "code lines" and it will name it "timer_tag"
@@ -53,41 +53,41 @@ module prg_timer_mod
   !> Timer status type
   type timer_status_t
 
-     !> Timer name
-     character(LEN=20) :: tname
+    !> Timer name
+    character(LEN=20) :: tname
 
-     !> Start time
-     integer :: tstart
+    !> Start time
+    integer :: tstart
 
-     !> Current total time
-     integer :: ttotal
+    !> Current total time
+    integer :: ttotal
 
-     !> Current call count
-     integer :: tcount
+    !> Current call count
+    integer :: tcount
 
-     !> Rank with min value
-     integer :: minRank
+    !> Rank with min value
+    integer :: minRank
 
-     !> Rank with max value
-     integer :: maxRank
+    !> Rank with max value
+    integer :: maxRank
 
-     !> Sum time - total time in secs
-     real(dp):: tsum
+    !> Sum time - total time in secs
+    real(dp):: tsum
 
-     !> Minimum value over all ranks
-     real(dp) :: minValue
+    !> Minimum value over all ranks
+    real(dp) :: minValue
 
-     !> Maximum value over all ranks
-     real(dp) :: maxValue
+    !> Maximum value over all ranks
+    real(dp) :: maxValue
 
-     !> Average value over all ranks
-     real(dp) :: tavg
+    !> Average value over all ranks
+    real(dp) :: tavg
 
-     !> Stdev across all ranks
-     real(dp) :: tstdev
+    !> Stdev across all ranks
+    real(dp) :: tstdev
 
-     !> Percent of time across all timers
-     real(dp) :: tpercent
+    !> Percent of time across all timers
+    real(dp) :: tpercent
 
   end type timer_status_t
 
@@ -114,7 +114,7 @@ module prg_timer_mod
   !
   !  ptimer(loop_timer)%tname = "Loop"
   !  ptimer(sp2_timer)%tname  = "  SP2"
-  !  ptimer(genx_timer)%tname = "  GenX" 
+  !  ptimer(genx_timer)%tname = "  GenX"
   !  ptimer(new_timer)%tname  = "  New"
   !
 
@@ -133,7 +133,7 @@ subroutine timer_prg_init()
     integer :: i
 
     ! Increment when adding a new timer
-    num_timers = 24 
+    num_timers = 24
 
     allocate(ptimer(num_timers))
 
@@ -178,19 +178,19 @@ subroutine timer_prg_init()
     ptimer(suball_timer)%tname     = "   SubAll"
     ptimer(bmult_timer)%tname      = "    BMult"
     ptimer(badd_timer)%tname       = "    BAdd"
-    ptimer(dyn_timer)%tname        = "        " !Reserved for dynamic timing  
-    ptimer(mdloop_timer)%tname     = "MDLoop"   
-    ptimer(buildz_timer)%tname     = "  BuildZ"   
-    ptimer(realcoul_timer)%tname   = "  RealCoul"   
-    ptimer(recipcoul_timer)%tname  = "  RecipCoul"   
-    ptimer(pairpot_timer)%tname    = "  PairPot"   
-    ptimer(halfverlet_timer)%tname = "  HalfVerlet"   
-    ptimer(pos_timer)%tname        = "  Pos"   
-    ptimer(nlist_timer)%tname      = "  NList"   
+    ptimer(dyn_timer)%tname        = "        " !Reserved for dynamic timing
+    ptimer(mdloop_timer)%tname     = "MDLoop"
+    ptimer(buildz_timer)%tname     = "  BuildZ"
+    ptimer(realcoul_timer)%tname   = "  RealCoul"
+    ptimer(recipcoul_timer)%tname  = "  RecipCoul"
+    ptimer(pairpot_timer)%tname    = "  PairPot"
+    ptimer(halfverlet_timer)%tname = "  HalfVerlet"
+    ptimer(pos_timer)%tname        = "  Pos"
+    ptimer(nlist_timer)%tname      = "  NList"
 
     do i = 1, num_timers
-       ptimer(i)%ttotal = 0
-       ptimer(i)%tcount = 0
+      ptimer(i)%ttotal = 0
+      ptimer(i)%tcount = 0
     end do
 
   end subroutine timer_prg_init
@@ -214,10 +214,10 @@ end subroutine prg_timer_shutdown
   subroutine prg_timer_start(itimer,tag)
 
     integer, intent(in) :: itimer
-    character(len=*), intent(in), optional :: tag 
+    character(len=*), intent(in), optional :: tag
 
-    if(present(tag))then 
-       ptimer(itimer)%tname = tag
+    if(present(tag))then
+      ptimer(itimer)%tname = tag
     endif
 
     call system_clock(tstart_clock, tclock_rate, tclock_max)
@@ -238,9 +238,9 @@ subroutine prg_timer_stop(itimer,verbose)
     call system_clock(tstop_clock, tclock_rate, tclock_max)
     tprg_delta = tstop_clock - ptimer(itimer)%tstart
     if(present(verbose))then
-       if(verbose.gt.0)then 
-          write(*,*)"Time for "//trim(ptimer(itimer)%tname)//" = "//to_string(tprg_delta)//" ms"
-       endif
+      if(verbose.gt.0)then
+        write(*,*)"Time for "//trim(ptimer(itimer)%tname)//" = "//to_string(tprg_delta)//" ms"
+      endif
     endif
     ptimer(itimer)%ttotal = ptimer(itimer)%ttotal + tprg_delta
     ptimer(itimer)%tcount = ptimer(itimer)%tcount + 1
@@ -266,12 +266,12 @@ subroutine prg_timer_collect()
 
     !! Determine average of each timer across ranks
     do i = 1, num_timers
-       sendBuf(i) = float(ptimer(i)%ttotal)/float(tclock_rate)
+      sendBuf(i) = float(ptimer(i)%ttotal)/float(tclock_rate)
     enddo
     call sumRealParallel(sendBuf, recvBuf, num_timers);
 
     do i = 1, num_timers
-       ptimer(i)%tavg = recvBuf(i) / rranks
+      ptimer(i)%tavg = recvBuf(i) / rranks
     enddo
 
     !! Determine min and max across ranks and which rank
@@ -279,18 +279,18 @@ subroutine prg_timer_collect()
     allocate(reduceRecvBuf(num_timers))
 
     do i = 1, num_timers
-       reduceSendBuf(i)%val = float(ptimer(i)%ttotal)/float(tclock_rate)
-       reduceSendBuf(i)%rank = getMyRank()
+      reduceSendBuf(i)%val = float(ptimer(i)%ttotal)/float(tclock_rate)
+      reduceSendBuf(i)%rank = getMyRank()
     enddo
     call minRankRealParallel(reduceSendBuf, reduceRecvBuf, num_timers);
     do i = 1, num_timers
-       ptimer(i)%minValue = reduceRecvBuf(i)%val
-       ptimer(i)%minRank = reduceRecvBuf(i)%rank
+      ptimer(i)%minValue = reduceRecvBuf(i)%val
+      ptimer(i)%minRank = reduceRecvBuf(i)%rank
     enddo
     call maxRankRealParallel(reduceSendBuf, reduceRecvBuf, num_timers);
     do i = 1, num_timers
-       ptimer(i)%maxValue = reduceRecvBuf(i)%val
-       ptimer(i)%maxRank = reduceRecvBuf(i)%rank
+      ptimer(i)%maxValue = reduceRecvBuf(i)%val
+      ptimer(i)%maxRank = reduceRecvBuf(i)%rank
     enddo
 
     deallocate(reduceSendBuf)
@@ -298,12 +298,12 @@ subroutine prg_timer_collect()
 
     !! Determine standard deviation
     do i = 1, num_timers
-       temp = float(ptimer(i)%ttotal)/float(tclock_rate) - ptimer(i)%tavg
-       sendBuf(i) = temp * temp;
+      temp = float(ptimer(i)%ttotal)/float(tclock_rate) - ptimer(i)%tavg
+      sendBuf(i) = temp * temp;
     enddo
     call sumRealParallel(sendBuf, recvBuf, num_timers);
     do i = 1, num_timers
-       ptimer(i)%tstdev = sqrt(recvBuf(i) / rranks)
+      ptimer(i)%tstdev = sqrt(recvBuf(i) / rranks)
     enddo
 
     deallocate(sendBuf)
@@ -323,35 +323,35 @@ subroutine prg_timer_results()
     ! Print timer results
     if (printRank() .eq. 1) then
 
-       write(*,*) ""
-       write(*,*) "Timings for Rank ", getMyRank()
-       write(*,*) "Timer                 # Calls  Avg/Call (s)     Total (s)       % Time"
-       write(*,*) ""
-
-       do i = 1, num_timers
-          if (ptimer(i)%tcount .gt. 0) then
-             !!      ptimer(i)%tavg = (float(ptimer(i)%ttotal)/float(tclock_rate))/float(ptimer(i)%tcount)
-             ptimer(i)%tsum = float(ptimer(i)%ttotal)/float(tclock_rate)
-             ptimer(i)%tpercent = (ptimer(i)%tsum / ptimer(1)%tsum) * 100.0
-             write(*,10) ptimer(i)%tname, ptimer(i)%tcount, ptimer(i)%tsum/float(ptimer(i)%tcount), ptimer(i)%tsum, ptimer(i)%tpercent
-10           format(A23, I6, 3G16.6)
-          end if
-       end do
-
-       write(*,*) ""
-       write(*,*) "Timing Statistics Across ", getNRanks(), " Ranks:"
-       write(*,*) "Timer                      Rank: Min(s)        Rank: Max(s)            Avg(s)        Stdev(s)"
-       write(*,*)
-
-       do i = 1, num_timers
-          if (ptimer(i)%tcount > 0) then
-             write(*, 20) ptimer(i)%tname, &
-                  ptimer(i)%minRank, ptimer(i)%minValue, &
-                  ptimer(i)%maxRank, ptimer(i)%maxValue, &
-                  ptimer(i)%tavg, ptimer(i)%tstdev 
-20           format(A23,2X,I4,G16.6,I4,3G16.6) 
-          endif
-       enddo
+      write(*,*) ""
+      write(*,*) "Timings for Rank ", getMyRank()
+      write(*,*) "Timer                 # Calls  Avg/Call (s)     Total (s)       % Time"
+      write(*,*) ""
+
+      do i = 1, num_timers
+        if (ptimer(i)%tcount .gt. 0) then
+          !!      ptimer(i)%tavg = (float(ptimer(i)%ttotal)/float(tclock_rate))/float(ptimer(i)%tcount)
+          ptimer(i)%tsum = float(ptimer(i)%ttotal)/float(tclock_rate)
+          ptimer(i)%tpercent = (ptimer(i)%tsum / ptimer(1)%tsum) * 100.0
+          write(*,10) ptimer(i)%tname, ptimer(i)%tcount, ptimer(i)%tsum/float(ptimer(i)%tcount), ptimer(i)%tsum, ptimer(i)%tpercent
+10        format(A23, I6, 3G16.6)
+        end if
+      end do
+
+      write(*,*) ""
+      write(*,*) "Timing Statistics Across ", getNRanks(), " Ranks:"
+      write(*,*) "Timer                      Rank: Min(s)        Rank: Max(s)            Avg(s)        Stdev(s)"
+      write(*,*)
+
+      do i = 1, num_timers
+        if (ptimer(i)%tcount > 0) then
+          write(*, 20) ptimer(i)%tname, &
+               ptimer(i)%minRank, ptimer(i)%minValue, &
+               ptimer(i)%maxRank, ptimer(i)%maxValue, &
+               ptimer(i)%tavg, ptimer(i)%tstdev
+20        format(A23,2X,I4,G16.6,I4,3G16.6)
+        endif
+      enddo
     endif
 
   end subroutine prg_timer_results
@@ -372,7 +372,7 @@ subroutine prg_print_date_and_time(tag)
     implicit none
 
     character(len=*), intent(in) :: tag
-    character(2) :: monthchar, daychar,hourchar,minchar,secchar    
+    character(2) :: monthchar, daychar,hourchar,minchar,secchar
     integer :: sec, mins, hour, day, month, year
     integer :: timevector(8)
 
@@ -394,14 +394,14 @@ function int2char(ival)
 
     implicit none
 
-    integer, intent(in) :: ival 
+    integer, intent(in) :: ival
     character(2) :: int2char, myintchar
 
     if ((ival/10) .lt. 1) then
-       write(myintchar,'(I2)') ival
-       myintchar="0"//trim(adjustl(myintchar))
+      write(myintchar,'(I2)') ival
+      myintchar="0"//trim(adjustl(myintchar))
     else
-       write(myintchar,'(I2)') ival
+      write(myintchar,'(I2)') ival
     endif
 
     int2char = myintchar
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
index 54f4ecf1..a19cb5b2 100644
--- a/src/prg_xlbokernel_mod.F90
+++ b/src/prg_xlbokernel_mod.F90
@@ -1,7 +1,7 @@
 !> Pre-conditioned O(N) calculation of the kernel for XL-BOMD.
 !! \ingroup PROGRESS
-!! \brief Here are subroutines implementing Niklasson's scheme for 
-!!  low-rank, Krylov subspace approximation of the kernel. 
+!! \brief Here are subroutines implementing Niklasson's scheme for
+!!  low-rank, Krylov subspace approximation of the kernel.
 module prg_xlbokernel_mod
 
   use omp_lib
@@ -11,7 +11,7 @@ module prg_xlbokernel_mod
   use prg_timer_mod
   use prg_parallel_mod
   use prg_ewald_mod
-  use prg_implicit_fermi_mod 
+  use prg_implicit_fermi_mod
 
   implicit none
 
@@ -27,809 +27,810 @@ module prg_xlbokernel_mod
 
 contains
 
-subroutine Invert(A,AI,N)
-
-implicit none
-integer, parameter                          :: PREC = 8
-integer, intent(in)                         :: N
-real(PREC),                 intent(in)      :: A(N,N)
-real(PREC), intent(out)                     :: AI(N,N)
-real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
-integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
-integer                                     :: I,J,K
-
-external DGETRF
-external DGETRI
-
-AI = A
-LDA = N
-M = N
-LWORK = N+N*N
-
-call DGETRF(M, N, AI, LDA, IPIV, INFO)
-call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
-
-end subroutine Invert
-  
-!> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
-!! \param KRes The low rank approximation 
-!! \param KK0_bml The pre-conditioner K0.
-!! \param Res The residual q[n]-n
-!! \param FelTol Relative error tolerance for approximation 
-!! \param L Number of vectors used. 
-!! \param LMAX Maximum nr of vectors to use. 
-!! \param NUMRANK Nr of vectors to use. 
-!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
-!! \param mu The chemical potiential. 
-!! \param beta Scaled inverse temperature.
-!! \param RXYZ Nuclear coordinates.
-!! \param Box Box dimensions. 
-!! \param Hubbard_U Hubbard U list. 
-!! \param Element_Pointer List to keep track of elements. 
-!! \param Nr_atoms The number of atoms. 
-!! \param HDIM Hamiltonian matrix dimension. 
-!! \param Max_Nr_Neigh Max neighbours for Ewald. 
-!! \param Coulomb_acc Coulomb accuracy.
-!! \param nebcoul Neighbour lists.
-!! \param totnebcoul Number of neighbours list. 
-!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
-!! \param S_bml The S matrix. 
-!! \param Z_bml, The Z matrix. 
-!! \param Nocc Occupation. 
-!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
-!! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
-!! \param m_rec Number of recursion steps.
-!! \param threshold Threshold value for matrix truncation.
-!! \param Nr_elem Number of elements in Hubbard list. 
-subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, & 
-    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
-    S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
-
-!! Res = q[n] - n
-!! KK0 is preconditioner
-!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+  subroutine Invert(A,AI,N)
+
+    implicit none
+    integer, parameter                          :: PREC = 8
+    integer, intent(in)                         :: N
+    real(PREC),                 intent(in)      :: A(N,N)
+    real(PREC), intent(out)                     :: AI(N,N)
+    real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
+    integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
+    integer                                     :: I,J,K
+
+    external DGETRF
+    external DGETRI
+
+    AI = A
+    LDA = N
+    M = N
+    LWORK = N+N*N
+
+    call DGETRF(M, N, AI, LDA, IPIV, INFO)
+    call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
+
+  end subroutine Invert
+
+  !> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
+  !! \param KRes The low rank approximation
+  !! \param KK0_bml The pre-conditioner K0.
+  !! \param Res The residual q[n]-n
+  !! \param FelTol Relative error tolerance for approximation
+  !! \param L Number of vectors used.
+  !! \param LMAX Maximum nr of vectors to use.
+  !! \param NUMRANK Nr of vectors to use.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param mu The chemical potiential.
+  !! \param beta Scaled inverse temperature.
+  !! \param RXYZ Nuclear coordinates.
+  !! \param Box Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Pointer List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy.
+  !! \param nebcoul Neighbour lists.
+  !! \param totnebcoul Number of neighbours list.
+  !! \param Hinxlist List to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Nocc Occupation.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param Nr_elem Number of elements in Hubbard list.
+  subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
+       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
+    real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
+    real(dp), intent(in)      :: Hubbard_U(Nr_elem)
+    type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(20)             :: bml_type
+    integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(inout)   :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It,N,MN
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp, start, finish
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
+
+    call timer_prg_init()
+    bml_type = bml_get_type(HO_bml)
+    N = bml_get_N(HO_bml)
+    MN = bml_get_M(HO_bml)
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+
+    call bml_transpose(Z_bml,ZT_bml)
+
+    ! K0Res = KK0*Res temporary for matrix-vector multiplication
+    call bml_set_row(Res_bml,1,Res,1.0_dp*1e-10)
+    call bml_transpose(KK0_bml,KK0T_bml)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,1.0_dp*1e-10)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    K0Res = row_NA
+    dr = K0Res
+
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
+      I = I + 1
+      !write(*,*) 'dr =', norm2(dr)
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
+
+      ! Compute H1 = H(v)
+      dq_v = v
+      call prg_timer_start(1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
+             dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
 
-  implicit none
-  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
-  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-  real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
-  real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
-  real(dp), intent(in)      :: Res(Nr_atoms)
-  integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
-  real(dp), intent(in)      :: Hubbard_U(Nr_elem)
-  type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
-  real(dp)                  :: K0Res(Nr_atoms)
-  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
-  real(dp), intent(in)      :: threshold
-  integer,    intent(in)      :: LMAX
-  character(20)             :: bml_type
-  integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-  real(dp)                  :: dq_v(Nr_atoms)
-  real(dp), intent(inout)   :: KRes(Nr_atoms)
-  integer                     :: I,J,K,It,N,MN
-  integer, intent(out)        :: L
-  real(dp)                  :: Fel, proj_tmp, start, finish
-  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
-  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
-  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
-  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
-
-  call timer_prg_init()
-  bml_type = bml_get_type(HO_bml)
-  N = bml_get_N(HO_bml)
-  MN = bml_get_M(HO_bml)
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
-  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
-  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
-  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
-  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
-
-  call bml_transpose(Z_bml,ZT_bml)
-
-  ! K0Res = KK0*Res temporary for matrix-vector multiplication
-  call bml_set_row(Res_bml,1,Res,threshold)
-  call bml_transpose(KK0_bml,KK0T_bml)
-  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
-  call bml_get_row(K0Res_bml,1,row_NA)
-  K0Res = row_NA
-  dr = K0Res
-
-  I = 0
-  Fel = 1.D0  
-  do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
-    I = I + 1
-    vi(:,I) = dr/norm2(dr)
-    do J = 1,I-1
-      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
-    enddo
-    vi(:,I) = vi(:,I)/norm2(vi(:,I))
-    v(:) = vi(:,I)  ! v_i
+      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      call prg_timer_stop(1,1)
 
-    ! Compute H1 = H(v) 
-    dq_v = v 
-    call prg_timer_start(1)
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
-    do J = 1,Nr_atoms
-      call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
-      dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
-      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-    call prg_timer_stop(1,1)
 
+      call bml_deallocate(H1_bml)
+      call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
 
-    call bml_deallocate(H1_bml)
-    call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1,Nr_atoms-1
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
+          row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+      enddo
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
-    do It = 1,Nr_atoms-1
-       do K = Hinxlist(It)+1,Hinxlist(It+1)
-           row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
-    enddo
-     
-    ! H1 = 1/2(S*H1+H1*S)
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(X_bml,H1_bml)
-
-    ! H1 = Z^T H1 Z
-    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-        m_rec, mu, beta, real(nocc,dp), threshold)
-    
-    ! D1 = Z D1 Z^T
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    ! Compute dq/dv
-    call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
-    call bml_get_diagonal(X_bml,row1)
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
-    do It = 1, Nr_atoms-1
-       dq_dv(It) = 0
-       do K = Hinxlist(It)+1,Hinxlist(It+1)
+      ! H1 = 1/2(S*H1+H1*S)
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(X_bml,H1_bml)
+
+      ! H1 = Z^T H1 Z
+      call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
+
+      ! D1 = Z D1 Z^T
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      ! Compute dq/dv
+      call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
+      call bml_get_diagonal(X_bml,row1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1, Nr_atoms-1
+        dq_dv(It) = 0
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
           dq_dv(It) = dq_dv(It) + row1(K)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    dq_dv(Nr_atoms) = 0
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      dq_dv(Nr_atoms) = 0
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K)
+      enddo
+
+      dr = dq_dv - v
+      ! fi = K0(dq_dv - v)
+      call bml_set_row(Res_bml,1,dr,1.0_dp*1e-10)
+      call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,1.0_dp*1e-10)
+      call bml_get_row(K0Res_bml,1,row_NA)
+      dr = row_NA
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*K0Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+      deallocate(O, M)
+
     enddo
 
-    dr = dq_dv - v
-    ! fi = K0(dq_dv - v)
-    call bml_set_row(Res_bml,1,dr,threshold)
-    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+    deallocate(row1);deallocate(row2);deallocate(row_NA)
+    call bml_deallocate(KK0T_bml)
+    call bml_deallocate(K0Res_bml)
+    call bml_deallocate(Res_bml)
+    call bml_deallocate(ZT_bml)
+    call prg_timer_shutdown()
+
+  end subroutine prg_kernel_multirank_latte
+
+  ! Above routine but for development code
+  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+       S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(10), intent(in)   :: Element_Type(Nr_atoms)
+    integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(inout)   :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
+
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+
+    ! K0Res = KK0*Res temporary for matrix-vector multiplication
+    call bml_transpose(KK0_bml,KK0T_bml)
+    call bml_set_row(Res_bml,1,Res,ONE*1e-14)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
     call bml_get_row(K0Res_bml,1,row_NA)
-    dr = row_NA
-    fi(:,I) = dr
-
-    L = I
-    allocate(O(L,L), M(L,L))
-    do K = 1,L
-    do J = 1,L
-       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
-    enddo
-    enddo
-    call Invert(O,M,L)                        ! M = O^(-1)
-    IdentRes = 0.D0*K0Res
-    KRes = 0.D0
-    do K = 1,L
-    do J = 1,L
-      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
-      IdentRes = IdentRes + proj_tmp*fi(:,K)
-      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
-    enddo
-    enddo
-    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
-    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
-    deallocate(O, M)
+    K0Res = row_NA
+    dr = K0Res
+
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+      I = I + 1
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
 
-  enddo
+      dq_v = v
 
-  deallocate(row1);deallocate(row2);deallocate(row_NA)
-  call bml_deallocate(KK0T_bml)
-  call bml_deallocate(K0Res_bml)
-  call bml_deallocate(Res_bml)
-  call bml_deallocate(ZT_bml)
-  call prg_timer_shutdown() 
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+             dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
 
-  end subroutine prg_kernel_multirank_latte
+      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
-  ! Above routine but for development code   
-  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
-    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
-    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+      row1 = 0.0_dp
+      do J = 1,HDIM
+        call bml_set_row(H1_bml,J,row1,threshold)
+      enddo
 
-!! Res = q[n] - n
-!! KK0 is preconditioner
-!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+      do J = 1,Nr_atoms
+        do K = H_INDEX_START(J),H_INDEX_END(J)
+          row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
 
-  implicit none
-  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
-  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
-  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-  real(dp), intent(in)      :: Res(Nr_atoms)
-  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
-  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
-  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
-  real(dp)                  :: K0Res(Nr_atoms)
-  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
-  real(dp), intent(in)      :: threshold
-  integer,    intent(in)      :: LMAX
-  character(10), intent(in)   :: Element_Type(Nr_atoms)
-  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
-  real(dp)                  :: dq_v(Nr_atoms)
-  real(dp), intent(inout)   :: KRes(Nr_atoms)
-  integer                     :: I,J,K,It
-  integer, intent(out)        :: L
-  real(dp)                  :: Fel, proj_tmp
-  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
-  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
-  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
-  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
-
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
-  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
-  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
-  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
-
-  ! K0Res = KK0*Res temporary for matrix-vector multiplication
-  call bml_transpose(KK0_bml,KK0T_bml) 
-  call bml_set_row(Res_bml,1,Res,ONE*1e-14)
-  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
-  call bml_get_row(K0Res_bml,1,row_NA)
-  K0Res = row_NA
-  dr = K0Res
-
-  I = 0
-  Fel = 1.D0  
-  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
-    I = I + 1
-    vi(:,I) = dr/norm2(dr)
-    do J = 1,I-1
-      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
-    enddo
-    vi(:,I) = vi(:,I)/norm2(vi(:,I))
-    v(:) = vi(:,I)  ! v_i
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
 
-    dq_v = v 
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
-    do J = 1,Nr_atoms
-      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
-      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
-      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    row1 = 0.0_dp
-    do J = 1,HDIM
-       call bml_set_row(H1_bml,J,row1,threshold)
-    enddo
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
-    do J = 1,Nr_atoms
-       do K = H_INDEX_START(J),H_INDEX_END(J)
-           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(D1_bml,H1_bml)
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-        m_rec, mu, beta, real(nocc,dp), threshold)
-    
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
-    do It = 1, Nr_atoms
-      dq_dv(It) = 0
-      do K = H_INDEX_START(It), H_INDEX_END(It)
-        call bml_get_row(S_bml,K,row1)
-        call bml_get_row(D1_bml,K,row2)
-        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
       enddo
-    enddo
-    !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
+
+      dr = dq_dv - v
+      ! fi = K0(dq_dv - v)
+      call bml_set_row(Res_bml,1,dr,threshold)
+      call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+      call bml_get_row(K0Res_bml,1,row_NA)
+      dr = row_NA
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*K0Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+      deallocate(O, M)
 
-    dr = dq_dv - v 
-    ! fi = K0(dq_dv - v)
-    call bml_set_row(Res_bml,1,dr,threshold)
-    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
-    call bml_get_row(K0Res_bml,1,row_NA)
-    dr = row_NA
-    fi(:,I) = dr
-
-    L = I
-    allocate(O(L,L), M(L,L))
-    do K = 1,L
-    do J = 1,L
-       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
-    enddo
     enddo
-    call Invert(O,M,L)                        ! M = O^(-1)
-    IdentRes = 0.D0*K0Res
-    KRes = 0.D0
-    do K = 1,L
-    do J = 1,L
-      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
-      IdentRes = IdentRes + proj_tmp*fi(:,K)
-      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
-    enddo
-    enddo
-    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
-    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
-    deallocate(O, M)
-
-  enddo
 
-  deallocate(row1);deallocate(row2);deallocate(row_NA)
-  call bml_deallocate(KK0T_bml)
-  call bml_deallocate(K0Res_bml)
-  call bml_deallocate(Res_bml)
+    deallocate(row1);deallocate(row2);deallocate(row_NA)
+    call bml_deallocate(KK0T_bml)
+    call bml_deallocate(K0Res_bml)
+    call bml_deallocate(Res_bml)
 
   end subroutine prg_kernel_multirank
 
-!> Compute full inverse Jacobian of q[n]-n (for LATTE)
-!! \param KK The inverse Jacobian.
-!! \param DO_bml Orthogonalized density matrix. 
-!! \param mu0 The chemical potiential. 
-!! \param RXYZ Nuclear coordinates.
-!! \param Box Box dimensions. 
-!! \param Hubbard_U Hubbard U list. 
-!! \param Element_Pointer List to keep track of elements. 
-!! \param Nr_atoms The number of atoms. 
-!! \param HDIM Hamiltonian matrix dimension. 
-!! \param Max_Nr_Neigh Max neighbours for Ewald. 
-!! \param Coulomb_acc Coulomb accuracy
-!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
-!! \param S_bml The S matrix. 
-!! \param Z_bml, The Z matrix. 
-!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
-!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
-!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
-!! \param Nocc Occupation. 
-!! \param m_rec Number of recursion steps.
-!! \param threshold Threshold value for matrix truncation.
-!! \param beta Scaled inverse temperature.
-!! \param Nr_elem Number of elements in Hubbard list. 
-subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
-Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, & 
-Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
-Nocc,m_rec,threshold,beta,Nr_elem)
-
-use bml
-
-implicit none
-integer, parameter             :: PREC = 8, dp = kind(1.0d0)
-integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
-real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
-real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
-real(PREC)                     :: v(Nr_atoms)
-real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
-integer,    intent(in)         :: Hinxlist(HDIM)
-real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
-type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
-type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
-type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
-real(PREC), intent(inout)      :: mu0
-integer,    intent(in)         :: Element_Pointer(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-real(PREC)                     :: dq_v(Nr_atoms)
-real(PREC)                     :: dq_dv(Nr_atoms), err,tol
-real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
-integer                        :: I,J,K, ITER, mm,It,N,MN
-real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)     
-type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
-character(20)                  :: bml_type
-
-  bml_type = bml_get_type(HO_bml)
-  N = bml_get_N(HO_bml)
-  MN = bml_get_M(HO_bml)
-  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
-  call bml_transpose(Z_bml,ZT_bml)
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
-  Coulomb_Pot_dq_v = ZERO
-  Coulomb_Pot_k = ZERO
-  dq_v = ZERO
-  JJ = ZERO
-  KK = ZERO
-  
-  do J = 1,Nr_atoms  
-    dq_v(J) = ONE
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
-    do I = 1,Nr_atoms   
+  !> Compute full inverse Jacobian of q[n]-n (for LATTE)
+  !! \param KK The inverse Jacobian.
+  !! \param DO_bml Orthogonalized density matrix.
+  !! \param mu0 The chemical potiential.
+  !! \param RXYZ Nuclear coordinates.
+  !! \param Box Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Pointer List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy
+  !! \param Hinxlist List to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param Nocc Occupation.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param beta Scaled inverse temperature.
+  !! \param Nr_elem Number of elements in Hubbard list.
+  subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
+       Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, &
+       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Nocc,m_rec,threshold,beta,Nr_elem)
+
+    use bml
+
+    implicit none
+    integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
+    real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
+    real(PREC)                     :: v(Nr_atoms)
+    real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
+    integer,    intent(in)         :: Hinxlist(HDIM)
+    real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
+    type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+    type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    real(PREC), intent(inout)      :: mu0
+    integer,    intent(in)         :: Element_Pointer(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(PREC)                     :: dq_v(Nr_atoms)
+    real(PREC)                     :: dq_dv(Nr_atoms), err,tol
+    real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+    integer                        :: I,J,K, ITER, mm,It,N,MN
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
+    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
+    character(20)                  :: bml_type
+
+    bml_type = bml_get_type(HO_bml)
+    N = bml_get_N(HO_bml)
+    MN = bml_get_M(HO_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_transpose(Z_bml,ZT_bml)
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    Coulomb_Pot_dq_v = ZERO
+    Coulomb_Pot_k = ZERO
+    dq_v = ZERO
+    JJ = ZERO
+    KK = ZERO
+
+    do J = 1,Nr_atoms
+      dq_v(J) = ONE
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+      do I = 1,Nr_atoms
         call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
-        dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
+             dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
-    Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    call bml_deallocate(H1_bml)
-    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
-
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
-    do I = 1,Nr_atoms-1
-       do K = Hinxlist(I)+1,Hinxlist(I+1)
-           row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
-    enddo
+      enddo
+      !$OMP END PARALLEL DO
+      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
+           Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      call bml_deallocate(H1_bml)
+      call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+      do I = 1,Nr_atoms-1
+        do K = Hinxlist(I)+1,Hinxlist(I+1)
+          row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+      enddo
 
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(X_bml,H1_bml)
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(X_bml,H1_bml)
 
-    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec,mu0,beta,real(nocc,PREC),threshold) 
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold)
 
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
 
-    call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
-    call bml_get_diagonal(X_bml,row1)
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
-    do It = 1, Nr_atoms-1
-       dq_dv(It) = 0
-       do K = Hinxlist(It)+1,Hinxlist(It+1)
+      call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_get_diagonal(X_bml,row1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1, Nr_atoms-1
+        dq_dv(It) = 0
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
           dq_dv(It) = dq_dv(It) + row1(K)
-       enddo
-       JJ(It,J) = dq_dv(It)
-    enddo
-    !$OMP END PARALLEL DO
-    dq_dv(Nr_atoms) = 0
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+        enddo
+        JJ(It,J) = dq_dv(It)
+      enddo
+      !$OMP END PARALLEL DO
+      dq_dv(Nr_atoms) = 0
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K)
+      enddo
+      JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+      dq_v = ZERO
     enddo
-    JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+
+    do I = 1,Nr_atoms
+      JJ(I,I) = JJ(I,I) - ONE
+    enddo
+
+    call Invert(JJ,KK,Nr_atoms)
+
+    deallocate(row1); deallocate(row2); deallocate(JJ)
+    call bml_deallocate(ZT_bml)
+
+  end subroutine prg_full_kernel_latte
+
+  !> Compute full inverse Jacobian of q[n]-n (for development code)
+  !! \param KK The inverse Jacobian.
+  !! \param DO_bml Orthogonalized density matrix.
+  !! \param mu0 The chemical potiential.
+  !! \param RX,RY,RZ Nuclear coordinates.
+  !! \param Lbox Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Type List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy
+  !! \param TIMERATIO Parameter for Ewald
+  !! \param nnRx,nnRy,nnRz Neighbour lists.
+  !! \param nrnnlist Number of neighbours list.
+  !! \param nnType Refers to original order of atoms.
+  !! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param Nocc Occupation.
+  !! \param Znuc List of nuclear charges.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param beta Scaled inverse temperature.
+  !! \param diagonal Auxillary vector.
+
+  subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, &
+       nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Nocc,Znuc,m_rec,threshold,beta,diagonal)
+
+    use bml
+
+    implicit none
+    integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+    integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
+    real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+    real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
+    real(PREC)                     :: v(Nr_atoms)
+    real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+    type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
+    character(10), intent(in)      :: Element_Type(Nr_atoms)
+    integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(PREC)                     :: dq_v(Nr_atoms)
+    real(PREC)                     :: dq_dv(Nr_atoms)
+    real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+    integer                        :: I,J,K, ITER, mm,It
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
+
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    Coulomb_Pot_dq_v = ZERO
+    Coulomb_Pot_k = ZERO
     dq_v = ZERO
-  enddo
-  
-  do I = 1,Nr_atoms
-     JJ(I,I) = JJ(I,I) - ONE
-  enddo
-  
-  call Invert(JJ,KK,Nr_atoms)
-  
-  deallocate(row1); deallocate(row2); deallocate(JJ)
-  call bml_deallocate(ZT_bml)
-
-end subroutine prg_full_kernel_latte
-
-!> Compute full inverse Jacobian of q[n]-n (for development code) 
-!! \param KK The inverse Jacobian.
-!! \param DO_bml Orthogonalized density matrix. 
-!! \param mu0 The chemical potiential. 
-!! \param RX,RY,RZ Nuclear coordinates.
-!! \param Lbox Box dimensions. 
-!! \param Hubbard_U Hubbard U list. 
-!! \param Element_Type List to keep track of elements. 
-!! \param Nr_atoms The number of atoms. 
-!! \param HDIM Hamiltonian matrix dimension. 
-!! \param Max_Nr_Neigh Max neighbours for Ewald. 
-!! \param Coulomb_acc Coulomb accuracy
-!! \param TIMERATIO Parameter for Ewald
-!! \param nnRx,nnRy,nnRz Neighbour lists.
-!! \param nrnnlist Number of neighbours list.
-!! \param nnType Refers to original order of atoms. 
-!! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian. 
-!! \param S_bml The S matrix. 
-!! \param Z_bml, The Z matrix. 
-!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
-!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
-!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
-!! \param Nocc Occupation. 
-!! \param Znuc List of nuclear charges.
-!! \param m_rec Number of recursion steps.
-!! \param threshold Threshold value for matrix truncation.
-!! \param beta Scaled inverse temperature.
-!! \param diagonal Auxillary vector. 
-
-subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
-Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, & 
-nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
-Nocc,Znuc,m_rec,threshold,beta,diagonal)
-
-use bml
-
-implicit none
-integer, parameter             :: PREC = 8, dp = kind(1.0d0)
-integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
-real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
-real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
-real(PREC)                     :: v(Nr_atoms)
-real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
-real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
-type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
-type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
-type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
-real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
-character(10), intent(in)      :: Element_Type(Nr_atoms)
-integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
-real(PREC)                     :: dq_v(Nr_atoms)
-real(PREC)                     :: dq_dv(Nr_atoms) 
-real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
-integer                        :: I,J,K, ITER, mm,It
-real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)        
-
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
-  Coulomb_Pot_dq_v = ZERO
-  Coulomb_Pot_k = ZERO
-  dq_v = ZERO
-  JJ = ZERO
-  KK = ZERO
-  
-  do J = 1,Nr_atoms  
-    dq_v(J) = ONE
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
-    do I = 1,Nr_atoms   
+    JJ = ZERO
+    KK = ZERO
+
+    do J = 1,Nr_atoms
+      dq_v(J) = ONE
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+      do I = 1,Nr_atoms
         call Ewald_Real_Space_Single(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,I,RX,RY,RZ,LBox, &
-        dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
+             dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
-    TIMERATIO,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    diagonal = 0.0_dp
-    do I = 1,HDIM
-       call bml_set_row(H1_bml,I,diagonal,threshold)
-    enddo
-    
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
-    do I = 1,Nr_atoms
-       do K = H_INDEX_START(I),H_INDEX_END(I)
-           diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call bml_set_diagonal(H1_bml,diagonal,threshold)
-    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(D1_bml,H1_bml)
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec,mu0,beta,real(nocc,PREC),threshold) 
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    do It = 1, Nr_atoms
-       dq_dv(It) = 0
-       do K = H_INDEX_START(It), H_INDEX_END(It)
+      enddo
+      !$OMP END PARALLEL DO
+      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+           TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      diagonal = 0.0_dp
+      do I = 1,HDIM
+        call bml_set_row(H1_bml,I,diagonal,threshold)
+      enddo
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+      do I = 1,Nr_atoms
+        do K = H_INDEX_START(I),H_INDEX_END(I)
+          diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      call bml_set_diagonal(H1_bml,diagonal,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
           call bml_get_row(S_bml,K,row1)
           call bml_get_row(D1_bml,K,row2)
           dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
-       enddo
-       JJ(It,J) = dq_dv(It)
+        enddo
+        JJ(It,J) = dq_dv(It)
+      enddo
+      dq_v = ZERO
     enddo
-    dq_v = ZERO
-  enddo
 
-  do I = 1,Nr_atoms
-     JJ(I,I) = JJ(I,I) - ONE
-  enddo
-  call Invert(JJ,KK,Nr_atoms)
-  deallocate(row1); deallocate(row2); deallocate(JJ)
+    do I = 1,Nr_atoms
+      JJ(I,I) = JJ(I,I) - ONE
+    enddo
+    call Invert(JJ,KK,Nr_atoms)
+    deallocate(row1); deallocate(row2); deallocate(JJ)
+
+  end subroutine prg_full_kernel
+
+
+  ! Compute the low-rank kernel matrix. (For development code)
+  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+       S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    type(bml_matrix_t),intent(inout) :: KK0_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(10), intent(in)   :: Element_Type(Nr_atoms)
+    integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(out)     :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It,col,row
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp,elem
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
+
+    allocate(row1(HDIM));allocate(row2(HDIM));
+
+    dr = Res
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+      I = I + 1
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
 
-end subroutine prg_full_kernel
+      dq_v = v
 
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+             dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
 
-! Compute the low-rank kernel matrix. (For development code) 
-  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
-    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
-    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
-!! Res = q[n] - n
-!! KK0 is preconditioner
-!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+      row1 = 0.0_dp
+      do J = 1,HDIM
+        call bml_set_row(H1_bml,J,row1,threshold)
+      enddo
 
-  implicit none
-  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
-  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
-  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-  real(dp), intent(in)      :: Res(Nr_atoms)
-  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
-  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
-  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
-  real(dp)                  :: K0Res(Nr_atoms)
-  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
-  type(bml_matrix_t),intent(inout) :: KK0_bml
-  real(dp), intent(in)      :: threshold
-  integer,    intent(in)      :: LMAX
-  character(10), intent(in)   :: Element_Type(Nr_atoms)
-  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
-  real(dp)                  :: dq_v(Nr_atoms)
-  real(dp), intent(out)     :: KRes(Nr_atoms)
-  integer                     :: I,J,K,It,col,row
-  integer, intent(out)        :: L
-  real(dp)                  :: Fel, proj_tmp,elem
-  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
-  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
-  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
-
-  allocate(row1(HDIM));allocate(row2(HDIM));
-
-  dr = Res
-  I = 0
-  Fel = 1.D0  
-  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
-    I = I + 1
-    vi(:,I) = dr/norm2(dr)
-    do J = 1,I-1
-      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
-    enddo
-    vi(:,I) = vi(:,I)/norm2(vi(:,I))
-    v(:) = vi(:,I)  ! v_i
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+      do J = 1,Nr_atoms
+        do K = H_INDEX_START(J),H_INDEX_END(J)
+          row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
 
-    dq_v = v 
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
-    do J = 1,Nr_atoms
-      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
-      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
-      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    row1 = 0.0_dp
-    do J = 1,HDIM
-       call bml_set_row(H1_bml,J,row1,threshold)
-    enddo
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
-    do J = 1,Nr_atoms
-       do K = H_INDEX_START(J),H_INDEX_END(J)
-           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(D1_bml,H1_bml)
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-        m_rec, mu, beta, real(nocc,dp), threshold)
-    
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
-    do It = 1, Nr_atoms
-      dq_dv(It) = 0
-      do K = H_INDEX_START(It), H_INDEX_END(It)
-        call bml_get_row(S_bml,K,row1)
-        call bml_get_row(D1_bml,K,row2)
-        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
       enddo
-    enddo
-    !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
 
-    dr = dq_dv - v 
-    fi(:,I) = dr
+      dr = dq_dv - v
+      fi(:,I) = dr
 
-    L = I
-    allocate(O(L,L), M(L,L))
-    do K = 1,L
-    do J = 1,L
-       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
-    enddo
-    enddo
-    call Invert(O,M,L)                        ! M = O^(-1)
-    IdentRes = 0.D0*Res
-    KRes = 0.D0
-    do K = 1,L
-    do J = 1,L
-      proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
-      IdentRes = IdentRes + proj_tmp*fi(:,K)
-      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n) 
-    enddo
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+
+      ! Does not work, need to normalize matrix
+      if ((Fel > FelTol).AND.(I < (LMAX))) then
+        deallocate(O, M)
+      else
+        do row = 1,Nr_atoms
+          do col = 1,Nr_atoms
+            elem = 0.0
+            do K = 1,L
+              do J = 1,L
+                elem = elem + M(J,K)*vi(row,J)*fi(col,K)
+              enddo
+            enddo
+            if (abs(elem) > threshold) then
+              call bml_set_element(KK0_bml,row,col,elem)
+            endif
+          enddo
+        enddo
+        deallocate(O,M)
+      endif
     enddo
-    Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
-    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
-  
-    ! Does not work, need to normalize matrix  
-        if ((Fel > FelTol).AND.(I < (LMAX))) then 
-          deallocate(O, M)
-        else 
-         do row = 1,Nr_atoms
-         do col = 1,Nr_atoms 
-           elem = 0.0
-           do K = 1,L
-             do J = 1,L 
-               elem = elem + M(J,K)*vi(row,J)*fi(col,K)
-             enddo
-           enddo 
-           if (abs(elem) > threshold) then
-             call bml_set_element(KK0_bml,row,col,elem)
-           endif
-         enddo
-         enddo  
-         deallocate(O,M)
-         endif
-    enddo 
     deallocate(row1); deallocate(row2);
 
   end subroutine prg_kernel_matrix_multirank
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cbc484e7..f2d8d5df 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -56,6 +56,7 @@ progress_test(prg_density_T)
 progress_test(prg_density_T_fermi)
 progress_test(prg_sp2_basic)
 progress_test(prg_implicit_fermi)
+progress_test(prg_implicit_fermi_save_inverse)
 progress_test(prg_sp2_alg1_dense)
 progress_test(prg_sp2_alg2_dense)
 progress_test(prg_sp2_alg1_ellpack)
diff --git a/tests/src/main.F90 b/tests/src/main.F90
index 995644c0..3ab0354f 100644
--- a/tests/src/main.F90
+++ b/tests/src/main.F90
@@ -32,6 +32,7 @@ program main
   implicit none
 
   integer :: norb, mdim, verbose
+  type(bml_matrix_t) :: inv_bml(10)  
   type(bml_matrix_t) :: ham_bml
   type(bml_matrix_t) :: rho_bml, rho1_bml
   type(bml_matrix_t) :: rho_ortho_bml
@@ -59,7 +60,7 @@ program main
   real(dp) :: mineval, maxeval, occerrlimit
   real(dp), allocatable :: gbnd(:)
   integer :: minsp2iter, icount, nodesPerPart, occsteps
-  integer :: norecs
+  integer :: norecs,occiter,i
   integer :: maxsp2iter, npts, sp2all_timer, sp2all_timer_init
   integer, allocatable :: pp(:), signlist(:)
   real(dp), allocatable :: vv(:)
@@ -95,7 +96,7 @@ program main
 
   !The following Hamiltonian belongs to a water box structure
   !which was precalculated with dftb+
-!  call h_read(ham,nOrb)
+  !  call h_read(ham,nOrb)
 
 
   !Convert the Hamiltonian to bml
@@ -108,1006 +109,1041 @@ program main
 
   case("prg_density") !Diagonalize H and build \rho
 
-     write(*,*) "Testing the construction of the density matrix from density_mod"
+    write(*,*) "Testing the construction of the density matrix from density_mod"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call prg_build_density_T0(ham_bml, rho_bml, threshold, bndfil)
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     write(*,*)"Idempotency for prg_build_density_T0",idempotency
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call prg_build_density_T0(ham_bml, rho_bml, threshold, bndfil)
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    write(*,*)"Idempotency for prg_build_density_T0",idempotency
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
   case("prg_density_T") !Diagonalize H and build \rho with electronic temperature KbT
 
-     write(*,*) "Testing the construction of the density at KbT > 0 matrix from density_mod"
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    write(*,*) "Testing the construction of the density at KbT > 0 matrix from density_mod"
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call prg_build_density_T(ham_bml, rho_bml, threshold, bndfil, 0.01_dp, mu)
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     write(*,*)"Idempotency for prg_build_density_T0",idempotency
-     write(*,*)"Fermi level:",mu
-     if(idempotency.gt.1.0D-5)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call prg_build_density_T(ham_bml, rho_bml, threshold, bndfil, 0.01_dp, mu)
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    write(*,*)"Idempotency for prg_build_density_T0",idempotency
+    write(*,*)"Fermi level:",mu
+    if(idempotency.gt.1.0D-5)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
   case("prg_density_T_fermi") !Diagonalize H and build \rho with electronic temperature KbT and with chemical potential mu
 
-     write(*,*) "Testing the construction of the density matrix at KbT > 0 and at mu = Ef from density_mod"
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    write(*,*) "Testing the construction of the density matrix at KbT > 0 and at mu = Ef from density_mod"
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call prg_build_density_T_Fermi(ham_bml, rho_bml, threshold,0.01_dp, -0.10682896819759_dp, 1)
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     write(*,*)"Idempotency for prg_build_density_T0",idempotency
-     write(*,*)"Fermi level:",mu
-     if(idempotency.gt.1.0D-5)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call prg_build_density_T_Fermi(ham_bml, rho_bml, threshold,0.01_dp, -0.10682896819759_dp, 1)
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    write(*,*)"Idempotency for prg_build_density_T0",idempotency
+    write(*,*)"Fermi level:",mu
+    if(idempotency.gt.1.0D-5)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
   case("prg_density_cheb_fermi") !Use Chebyshev expansion to build the density matrix
 
-     write(*,*) "Testing the construction of the density matrix at KbT > 0 and at mu = Ef from chebyshev_mod"
+    write(*,*) "Testing the construction of the density matrix at KbT > 0 and at mu = Ef from chebyshev_mod"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     mu=1.0_dp
-     write(*,*)"mu",mu
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-     call prg_build_density_T_fermi(ham_bml, rho_bml, threshold,0.01_dp, -0.10682896819759_dp, 0)
-     call prg_build_density_cheb_fermi(ham_bml,rho1_bml,1.0_dp,&
-          threshold,200,0.01_dp,mu,bndfil,.true.,0.001_dp,&
-          .true.,2000,.false.,1)
-     call bml_add_deprecated(1.0_dp,rho1_bml,-1.0_dp,rho_bml,0.0_dp)
-     error_calc = bml_fnorm(rho1_bml)
-     if(error_calc.gt.0.1_dp)then
-        write(*,*) "Error in Chebyshev expansion","Error = ",error_calc
-        error stop
-     endif
+    mu=1.0_dp
+    write(*,*)"mu",mu
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
+    call prg_build_density_T_fermi(ham_bml, rho_bml, threshold,0.01_dp, -0.10682896819759_dp, 0)
+    call prg_build_density_cheb_fermi(ham_bml,rho1_bml,1.0_dp,&
+         threshold,200,0.01_dp,mu,bndfil,.true.,0.001_dp,&
+         .true.,2000,.false.,1)
+    call bml_add_deprecated(1.0_dp,rho1_bml,-1.0_dp,rho_bml,0.0_dp)
+    error_calc = bml_fnorm(rho1_bml)
+    if(error_calc.gt.0.1_dp)then
+      write(*,*) "Error in Chebyshev expansion","Error = ",error_calc
+      error stop
+    endif
 
   case("prg_implicit_fermi") !Use implicit recursive expansion by Niklasson to build density matrix
 
-     write(*,*) "Testing the construction of the density matrix at KbT > 0 and at mu = Ef from implicit_fermi_mod"
-     
-     mu = 0.2_dp
-     beta = 4.0_dp !nocc,osteps,occerrlimit
+    write(*,*) "Testing the construction of the density matrix at KbT > 0 and at mu = Ef from implicit_fermi_mod"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    mu = 0.2_dp
+    beta = 4.0_dp !nocc,osteps,occerrlimit
+    norecs = 10
+    bml_type = 'ellpack'
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-     call prg_implicit_fermi(ham_bml, rho1_bml, 10, 2, 10.0_dp, mu, beta, 0, 1, 1.0_dp, threshold, 10e-8_dp)
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     mu = 0.2_dp
-     
-     call prg_test_density_matrix(ham_bml, rho_bml, beta, mu, 10.0_dp, 1, 1.0_dp, threshold)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
+    call prg_implicit_fermi(ham_bml, rho1_bml, norecs, 2, 10.0_dp, mu, beta, 0, 1, 1.0_dp, threshold, 1e-6_dp)
 
-     call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
+    mu = 0.2_dp
 
-     error_calc = bml_fnorm(rho1_bml)
-     if(error_calc.gt.0.1_dp)then
-        write(*,*) "Error in Implicit Fermi expansion ","Error = ",error_calc
-        error stop
-     endif
+    call prg_test_density_matrix(ham_bml, rho_bml, beta, mu, 10.0_dp, 1, 1.0_dp, threshold)
+
+    call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
+
+    error_calc = bml_fnorm(rho1_bml)
+    write(*,*) error_calc
+    if(error_calc.gt.0.1_dp)then
+      write(*,*) "Error in Implicit Fermi expansion ","Error = ",error_calc
+      error stop
+    endif
+
+  case("prg_implicit_fermi_save_inverse") 
+
+    mu = 0.2_dp
+    beta = 4.0_dp !nocc,osteps,occerrlimit
+    norecs = 10
+    nocc = 10.0_dp
+
+    do i = 1,norecs
+      call bml_identity_matrix(bml_type,bml_element_real,dp,norb,norb,inv_bml(i))
+    enddo 
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
+    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
+    write(*,*) mu
+    mu = 0.2_dp  
+    call prg_implicit_fermi_save_inverse(inv_bml,ham_bml,rho_bml,norecs,nocc,mu,beta,1e-4_dp, threshold, 1e-5_dp, 1,occiter)
+
+    write(*,*) mu
+    call bml_scale(0.5_dp,rho_bml)
+    call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
+
+    error_calc = bml_fnorm(rho1_bml)
+    if(error_calc.gt.0.1_dp)then
+      write(*,*) "Error in Implicit Fermi expansion save inverse","Error = ",error_calc
+      error stop
+    endif
 
   case("prg_sp2_basic") !Sp2 original version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_basic(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
-          ,sp2conv,sp2tol,verbose)
-     call prg_timer_stop(sp2_timer)
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_basic(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
+         ,sp2conv,sp2tol,verbose)
+    call prg_timer_stop(sp2_timer)
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg1_dense") !Sp2 algorithm 1
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     bml_type = "dense"
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg1(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
-          ,sp2conv,sp2tol)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg1(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
+         ,sp2conv,sp2tol)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg2_dense") !Sp2 algorithm 2
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     bml_type = "dense"
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
-          ,sp2conv,sp2tol)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
+         ,sp2conv,sp2tol)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg1_ellpack") !Sp2 algorithm 1
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1d-6
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 600
-     threshold = 1.0d-9
-     sp2tol = 1.0d-10
+    idempotency_tol = 1d-6
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 600
+    threshold = 1.0d-9
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg1(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
-          ,sp2conv,sp2tol)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg1(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
+         ,sp2conv,sp2tol)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg2_ellpack") !Sp2 algorithm 2 ellpack version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1d-6
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 600
-     threshold = 1.0d-9
-     sp2tol = 1.0d-10
+    idempotency_tol = 1d-6
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 600
+    threshold = 1.0d-9
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
-          ,sp2conv,sp2tol)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
+         ,sp2conv,sp2tol)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg2_ellpack_poly") !Sp2 algorithm 2 ellpack version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1.0d-2
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 288
-     threshold = 1.0d-5
-     sp2tol = 1.0d-10
+    idempotency_tol = 1.0d-2
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 288
+    threshold = 1.0d-5
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
-          ,sp2conv,sp2tol)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2(ham_bml,rho_bml,threshold,bndfil,minsp2iter,maxsp2iter &
+         ,sp2conv,sp2tol)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml, threshold, idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml, threshold, idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg1_seq_dense") !Sp2 algorithm 1 sequence version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     bml_type = "dense"
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
 
-     allocate(pp(100),vv(100))
-     icount = 0
+    allocate(pp(100),vv(100))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg1_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg1_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
 
-     call prg_sp2_alg1_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_sp2_alg1_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv)
+    deallocate(pp, vv)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg2_seq_dense") !Sp2 algorithm 2 sequence version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     bml_type = "dense"
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
 
-     allocate(pp(100),vv(100))
-     icount = 0
+    allocate(pp(100),vv(100))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
 
-     call prg_sp2_alg2_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_sp2_alg2_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv)
+    deallocate(pp, vv)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg1_seq_ellpack") !Sp2 algorithm 1 sequence version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1d-6
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 600
-     threshold = 1.0d-9
-     sp2tol = 1.0d-10
+    idempotency_tol = 1d-6
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 600
+    threshold = 1.0d-9
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     allocate(pp(100),vv(100))
-     icount = 0
+    allocate(pp(100),vv(100))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg1_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg1_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
 
-     call prg_sp2_alg1_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_sp2_alg1_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv)
+    deallocate(pp, vv)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg2_seq_ellpack") !Sp2 algorithm 2 sequence version
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1d-6
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 600
-     threshold = 1.0d-9
-     sp2tol = 1.0d-10
+    idempotency_tol = 1d-6
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 600
+    threshold = 1.0d-9
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     allocate(pp(100),vv(100))
-     icount = 0
+    allocate(pp(100),vv(100))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
 
-     call prg_sp2_alg2_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_sp2_alg2_seq(ham_bml, rho_bml, threshold, pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv)
+    deallocate(pp, vv)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg1_seq_inplace_dense") !SP2 algorithm 1 seq version in place
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     bml_type = "dense"
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     allocate(pp(100),vv(100), gbnd(2))
-     icount = 0
+    allocate(pp(100),vv(100), gbnd(2))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_copy(ham_bml, rho_bml)
-     call bml_gershgorin(rho_bml, gbnd)
+    call bml_copy(ham_bml, rho_bml)
+    call bml_gershgorin(rho_bml, gbnd)
 
-     call prg_timer_start(sp2_timer)
-     call prg_prg_sp2_alg1_seq_inplace(rho_bml, threshold, pp, icount, &
-          vv, gbnd(1), gbnd(2))
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_prg_sp2_alg1_seq_inplace(rho_bml, threshold, pp, icount, &
+         vv, gbnd(1), gbnd(2))
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv, gbnd)
+    deallocate(pp, vv, gbnd)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg2_seq_inplace_dense") !SP2 algorithm 2 seq version in place
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     bml_type = "dense"
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     allocate(pp(100),vv(100), gbnd(2))
-     icount = 0
+    allocate(pp(100),vv(100), gbnd(2))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
+    call prg_timer_start(sp2_timer)
 
-     call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_copy(ham_bml, rho_bml)
-     call bml_gershgorin(rho_bml, gbnd)
+    call bml_copy(ham_bml, rho_bml)
+    call bml_gershgorin(rho_bml, gbnd)
 
-     call prg_timer_start(sp2_timer)
-     call prg_prg_sp2_alg2_seq_inplace(rho_bml, threshold, pp, icount, &
-          vv, gbnd(1), gbnd(2))
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_prg_sp2_alg2_seq_inplace(rho_bml, threshold, pp, icount, &
+         vv, gbnd(1), gbnd(2))
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv, gbnd)
+    deallocate(pp, vv, gbnd)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_sp2_alg1_seq_inplace_ellpack") !SP2 algorithm 1 seq version in place
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1d-6
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 600
-     threshold = 1.0d-9
-     sp2tol = 1.0d-10
+    idempotency_tol = 1d-6
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 600
+    threshold = 1.0d-9
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     allocate(pp(100),vv(100), gbnd(2))
-     icount = 0
+    allocate(pp(100),vv(100), gbnd(2))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_copy(ham_bml, rho_bml)
-     call bml_gershgorin(rho_bml, gbnd)
+    call bml_copy(ham_bml, rho_bml)
+    call bml_gershgorin(rho_bml, gbnd)
 
-     call prg_timer_start(sp2_timer)
-     call prg_prg_sp2_alg1_seq_inplace(rho_bml, threshold, pp, icount, &
-          vv, gbnd(1), gbnd(2))
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_prg_sp2_alg1_seq_inplace(rho_bml, threshold, pp, icount, &
+         vv, gbnd(1), gbnd(2))
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv, gbnd)
+    deallocate(pp, vv, gbnd)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
 
   case("prg_sp2_alg2_seq_inplace_ellpack") !SP2 algorithm 2 seq version in place
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     idempotency_tol = 1d-6
-     bml_type = "ellpack"
-     bndfil = 0.5_dp
-     norb = 6144
-     mdim = 600
-     threshold = 1.0d-9
-     sp2tol = 1.0d-10
+    idempotency_tol = 1d-6
+    bml_type = "ellpack"
+    bndfil = 0.5_dp
+    norb = 6144
+    mdim = 600
+    threshold = 1.0d-9
+    sp2tol = 1.0d-10
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     allocate(pp(100),vv(100), gbnd(2))
-     icount = 0
+    allocate(pp(100),vv(100), gbnd(2))
+    icount = 0
 
-     call prg_timer_start(sp2_timer)
-     call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, &
-          pp, icount, vv)
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_sp2_alg2_genseq(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, &
+         pp, icount, vv)
+    call prg_timer_stop(sp2_timer)
 
-     call bml_copy(ham_bml, rho_bml)
-     call bml_gershgorin(rho_bml, gbnd)
+    call bml_copy(ham_bml, rho_bml)
+    call bml_gershgorin(rho_bml, gbnd)
 
-     call prg_timer_start(sp2_timer)
-     call prg_prg_sp2_alg2_seq_inplace(rho_bml, threshold, pp, icount, &
-          vv, gbnd(1), gbnd(2))
-     call prg_timer_stop(sp2_timer)
+    call prg_timer_start(sp2_timer)
+    call prg_prg_sp2_alg2_seq_inplace(rho_bml, threshold, pp, icount, &
+         vv, gbnd(1), gbnd(2))
+    call prg_timer_stop(sp2_timer)
 
-     deallocate(pp, vv, gbnd)
+    deallocate(pp, vv, gbnd)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
 
   case("prg_sp2_fermi_dense") !SP2 algorithm 2 seq version in place
 
-     bml_type = "dense"
-     threshold = 1.0d-9
-     mdim = -1
-     verbose = 1
-     norecs = 20
-     nocc  = bndfil*norb
-     tscale = 1.0_dp
-     occerrlimit = 1.0D-9
-     tracelimit = 1.0D-12
-     mu = 0.0_dp
-     beta0 = 20.1_dp
-     mineval = 0.0_dp
-     maxeval = 0.0_dp
-     eps = 1.0D-4
-     occsteps = 0
-     sp2all_timer = 2
-     sp2all_timer_init = 3
-
-     allocate(signlist(30))
-     signlist = 0
-
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
-
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
-
-     call bml_zero_matrix(bml_type, bml_element_real, dp, norb, norb, orthox_bml)
-
-     call prg_timer_start(sp2all_timer_init,"SP2INIT")
-     call prg_sp2_fermi_init_norecs(ham_bml, norecs, nocc, tscale,&
-          & threshold, occerrlimit, tracelimit, orthox_bml, mu,&
-          & beta0, mineval, maxeval, signlist, 10)
-     call prg_sp2_fermi_init(ham_bml, norecs, nocc, tscale,&
-          & threshold, occerrlimit, tracelimit, orthox_bml, mu,&
-          & beta0, mineval, maxeval, signlist)
-     call prg_timer_stop(sp2all_timer_init,10)
-     call prg_timer_start(sp2all_timer,"SP2FERMI")
-     call prg_sp2_fermi(ham_bml, occsteps, norecs,&
-          & nocc, mu, beta0, mineval, maxeval,&
-          & signlist, threshold, eps, tracelimit, orthox_bml)
-
-     call prg_timer_stop(SP2ALL_TIMER,10)
-
-     write(*,*)"BETA", beta0
-     write(*,*)"ETEMP (eV)", 1.0_dp/beta0
-     write(*,*)"NORECS USED", norecs
-     write(*,*)"CHEMPOT", mu
-
-     kbt = 1.0_dp/beta0
-
-     call prg_build_density_T_fermi(ham_bml, rho_bml, threshold, kbt, mu, 0)
-
-     call bml_add_deprecated(2.0_dp,orthox_bml,-1.0_dp,rho_bml,0.0_dp)
-     error_calc = bml_fnorm(orthox_bml)
-
-     if(error_calc.gt.0.01_dp)then
-        write(*,*) "Error in sp2 Fermi","Error = ",error_calc
-        error stop
-     endif
+    bml_type = "dense"
+    threshold = 1.0d-9
+    mdim = -1
+    verbose = 1
+    norecs = 20
+    nocc  = bndfil*norb
+    tscale = 1.0_dp
+    occerrlimit = 1.0D-9
+    tracelimit = 1.0D-12
+    mu = 0.0_dp
+    beta0 = 20.1_dp
+    mineval = 0.0_dp
+    maxeval = 0.0_dp
+    eps = 1.0D-4
+    occsteps = 0
+    sp2all_timer = 2
+    sp2all_timer_init = 3
+
+    allocate(signlist(30))
+    signlist = 0
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+
+    call bml_zero_matrix(bml_type, bml_element_real, dp, norb, norb, orthox_bml)
+
+    call prg_timer_start(sp2all_timer_init,"SP2INIT")
+    call prg_sp2_fermi_init_norecs(ham_bml, norecs, nocc, tscale,&
+         & threshold, occerrlimit, tracelimit, orthox_bml, mu,&
+         & beta0, mineval, maxeval, signlist, 10)
+    call prg_sp2_fermi_init(ham_bml, norecs, nocc, tscale,&
+         & threshold, occerrlimit, tracelimit, orthox_bml, mu,&
+         & beta0, mineval, maxeval, signlist)
+    call prg_timer_stop(sp2all_timer_init,10)
+    call prg_timer_start(sp2all_timer,"SP2FERMI")
+    call prg_sp2_fermi(ham_bml, occsteps, norecs,&
+         & nocc, mu, beta0, mineval, maxeval,&
+         & signlist, threshold, eps, tracelimit, orthox_bml)
+
+    call prg_timer_stop(SP2ALL_TIMER,10)
+
+    write(*,*)"BETA", beta0
+    write(*,*)"ETEMP (eV)", 1.0_dp/beta0
+    write(*,*)"NORECS USED", norecs
+    write(*,*)"CHEMPOT", mu
+
+    kbt = 1.0_dp/beta0
+
+    call prg_build_density_T_fermi(ham_bml, rho_bml, threshold, kbt, mu, 0)
+
+    call bml_add_deprecated(2.0_dp,orthox_bml,-1.0_dp,rho_bml,0.0_dp)
+    error_calc = bml_fnorm(orthox_bml)
+
+    if(error_calc.gt.0.01_dp)then
+      write(*,*) "Error in sp2 Fermi","Error = ",error_calc
+      error stop
+    endif
 
   case("prg_equal_partition") ! Create equal partitions
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     call prg_timer_start(part_timer)
-     call prg_equalPartition(gp, 6, 72)
-     call prg_timer_stop(part_timer)
+    call prg_timer_start(part_timer)
+    call prg_equalPartition(gp, 6, 72)
+    call prg_timer_stop(part_timer)
 
-     call prg_printGraphPartitioning(gp)
-     if (gp%totalParts .ne. 12) then
-        write(*,*) "Number of parts is wrong ", gp%totalParts
-        call exit(-1)
-     endif
+    call prg_printGraphPartitioning(gp)
+    if (gp%totalParts .ne. 12) then
+      write(*,*) "Number of parts is wrong ", gp%totalParts
+      call exit(-1)
+    endif
 
-     call prg_destroyGraphPartitioning(gp)
+    call prg_destroyGraphPartitioning(gp)
 
-     call prg_timer_start(part_timer)
-     call prg_equalPartition(gp, 7, 72)
-     call prg_timer_stop(part_timer)
+    call prg_timer_start(part_timer)
+    call prg_equalPartition(gp, 7, 72)
+    call prg_timer_stop(part_timer)
 
-     call prg_printGraphPartitioning(gp)
-     if (gp%totalParts .ne. 11) then
-        write(*,*) "Number of parts is wrong ", gp%totalParts
-        !error stop(-1)
-        call exit(-1)
-     endif
+    call prg_printGraphPartitioning(gp)
+    if (gp%totalParts .ne. 11) then
+      write(*,*) "Number of parts is wrong ", gp%totalParts
+      !error stop(-1)
+      call exit(-1)
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_file_partition") ! Create partition from a file
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     call prg_timer_start(part_timer)
-     call prg_filePartition(gp, 'test.part')
-     call prg_timer_stop(part_timer)
+    call prg_timer_start(part_timer)
+    call prg_filePartition(gp, 'test.part')
+    call prg_timer_stop(part_timer)
 
-     call prg_printGraphPartitioning(gp)
-     if (gp%totalParts .ne. 104) then
-        write(*,*) "Number of parts is wrong ", gp%totalParts
-        error stop
-     endif
+    call prg_printGraphPartitioning(gp)
+    if (gp%totalParts .ne. 104) then
+      write(*,*) "Number of parts is wrong ", gp%totalParts
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_subgraphsp2_equal") ! Subgraph SP2 using equal size parts
 
-     call prg_timer_start(subgraph_timer)
+    call prg_timer_start(subgraph_timer)
 
-     bml_type = "ellpack"
-     norb = 6144
-     mdim = 300
-     threshold = 1.0d-5
-     bndfil = 0.5_dp
-     gthreshold = 1.0d-3
-     sp2tol = 1.0d-10
-     errlimit = 1.0d-12
-     nodesPerPart = 48
-     idempotency_tol = 1.0d-2
+    bml_type = "ellpack"
+    norb = 6144
+    mdim = 300
+    threshold = 1.0d-5
+    bndfil = 0.5_dp
+    gthreshold = 1.0d-3
+    sp2tol = 1.0d-10
+    errlimit = 1.0d-12
+    nodesPerPart = 48
+    idempotency_tol = 1.0d-2
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
-     call bml_read_matrix(ham_bml, "poly.512.mtx")
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,mdim,rho_bml)
+    call bml_read_matrix(ham_bml, "poly.512.mtx")
 
-     call prg_timer_start(loop_timer)
-     call test_subgraphloop(ham_bml, rho_bml, threshold, bndfil, &
-          minsp2iter, maxsp2iter, sp2conv, sp2tol, gthreshold, errlimit, &
-          nodesPerPart)
-     call prg_timer_stop(loop_timer)
+    call prg_timer_start(loop_timer)
+    call test_subgraphloop(ham_bml, rho_bml, threshold, bndfil, &
+         minsp2iter, maxsp2iter, sp2conv, sp2tol, gthreshold, errlimit, &
+         nodesPerPart)
+    call prg_timer_stop(loop_timer)
 
-     call bml_scale(0.5_dp, rho_bml)
-     call prg_check_idempotency(rho_bml,threshold,idempotency)
+    call bml_scale(0.5_dp, rho_bml)
+    call prg_check_idempotency(rho_bml,threshold,idempotency)
 
-     call prg_timer_stop(subgraph_timer)
+    call prg_timer_stop(subgraph_timer)
 
-     if(idempotency.gt.idempotency_tol)then
-        write(*,*) "Idempotency is too high", idempotency
-        error stop
-     endif
+    if(idempotency.gt.idempotency_tol)then
+      write(*,*) "Idempotency is too high", idempotency
+      error stop
+    endif
 
 
-stop
+    stop
   case("prg_deorthogonalize_dense") !Deorthogonalization of the density matrix
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     error_tol = 1.0d-9
+    error_tol = 1.0d-9
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_ortho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_ortho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
 
-     call bml_read_matrix(zmat_bml,'zmatrix.mtx')
-     call bml_read_matrix(rho_bml,'density.mtx')
-     call bml_read_matrix(rho_ortho_bml,'density_ortho.mtx')
+    call bml_read_matrix(zmat_bml,'zmatrix.mtx')
+    call bml_read_matrix(rho_bml,'density.mtx')
+    call bml_read_matrix(rho_ortho_bml,'density_ortho.mtx')
 
-     call prg_timer_start(deortho_timer)
-     call prg_deorthogonalize(rho_ortho_bml,zmat_bml,aux_bml,threshold,bml_type,verbose)
-     call prg_timer_stop(deortho_timer)
+    call prg_timer_start(deortho_timer)
+    call prg_deorthogonalize(rho_ortho_bml,zmat_bml,aux_bml,threshold,bml_type,verbose)
+    call prg_timer_stop(deortho_timer)
 
-     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,rho_bml,0.0_dp)
-     error_calc = bml_fnorm(aux_bml)
+    call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,rho_bml,0.0_dp)
+    error_calc = bml_fnorm(aux_bml)
 
-     call bml_deallocate(nonortho_ham_bml)
-     call bml_deallocate(zmat_bml)
-     call bml_deallocate(aux_bml)
+    call bml_deallocate(nonortho_ham_bml)
+    call bml_deallocate(zmat_bml)
+    call bml_deallocate(aux_bml)
 
-     write(*,*)"prg_orthogonalize error ", error_calc
+    write(*,*)"prg_orthogonalize error ", error_calc
 
-     if(error_calc.gt.error_tol)then
-        write(*,*) "Error is too high", error_calc
-        error stop
-     endif
+    if(error_calc.gt.error_tol)then
+      write(*,*) "Error is too high", error_calc
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_orthogonalize_dense") ! Orthogonalization of the Hamiltonian
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     error_tol = 1.0d-9
-     bml_type = "dense"
+    error_tol = 1.0d-9
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
 
-     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,nonortho_ham_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,nonortho_ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
 
-     call bml_read_matrix(zmat_bml,'zmatrix.mtx')
-     call bml_read_matrix(nonortho_ham_bml,'hamiltonian.mtx')
+    call bml_read_matrix(zmat_bml,'zmatrix.mtx')
+    call bml_read_matrix(nonortho_ham_bml,'hamiltonian.mtx')
 
-     call prg_timer_start(ortho_timer)
-     call prg_orthogonalize(nonortho_ham_bml,zmat_bml,aux_bml,threshold,bml_type,verbose)
-     call prg_timer_stop(ortho_timer)
+    call prg_timer_start(ortho_timer)
+    call prg_orthogonalize(nonortho_ham_bml,zmat_bml,aux_bml,threshold,bml_type,verbose)
+    call prg_timer_stop(ortho_timer)
 
-     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,ham_bml,0.0_dp)
-     error_calc = bml_fnorm(aux_bml)
+    call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,ham_bml,0.0_dp)
+    error_calc = bml_fnorm(aux_bml)
 
-     call bml_deallocate(nonortho_ham_bml)
-     call bml_deallocate(zmat_bml)
-     call bml_deallocate(aux_bml)
+    call bml_deallocate(nonortho_ham_bml)
+    call bml_deallocate(zmat_bml)
+    call bml_deallocate(aux_bml)
 
-     write(*,*)"Orthogonalize error ", error_calc
+    write(*,*)"Orthogonalize error ", error_calc
 
-     if(error_calc.gt.error_tol)then
-        write(*,*) "Error is too high", error_calc
-        error stop
-     endif
+    if(error_calc.gt.error_tol)then
+      write(*,*) "Error is too high", error_calc
+      error stop
+    endif
 
-     call prg_timer_stop(ortho_timer)
+    call prg_timer_stop(ortho_timer)
 
-   case("prg_pulaycomponent0")
+  case("prg_pulaycomponent0")
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-      error_tol = 1.0d-9
-      bml_type = "dense"
+    error_tol = 1.0d-9
+    bml_type = "dense"
 
-      call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
-      call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
-      call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
-      call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,pcm_bml)
-      call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,pcm_ref_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,pcm_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,pcm_ref_bml)
 
-      call bml_read_matrix(zmat_bml,"zmatrix.mtx")
-      call bml_read_matrix(ham_bml,"hamiltonian.mtx")
-      call bml_read_matrix(rho_bml,"density.mtx")
-      call bml_read_matrix(pcm_ref_bml,"pcm.mtx")
+    call bml_read_matrix(zmat_bml,"zmatrix.mtx")
+    call bml_read_matrix(ham_bml,"hamiltonian.mtx")
+    call bml_read_matrix(rho_bml,"density.mtx")
+    call bml_read_matrix(pcm_ref_bml,"pcm.mtx")
 
-      call prg_timer_start(zdiag_timer)
-      call prg_PulayComponent0(rho_bml,ham_bml,pcm_bml,threshold,mdim,&
-           &bml_type,verbose)
-      call prg_timer_stop(zdiag_timer)
+    call prg_timer_start(zdiag_timer)
+    call prg_PulayComponent0(rho_bml,ham_bml,pcm_bml,threshold,mdim,&
+         &bml_type,verbose)
+    call prg_timer_stop(zdiag_timer)
 
-      write(*,*)"Pulay Component error ", error_calc
+    write(*,*)"Pulay Component error ", error_calc
 
-      call bml_add_deprecated(-1.0_dp,pcm_bml,1.0_dp,pcm_ref_bml,0.0_dp)
-      error_calc = bml_fnorm(pcm_bml)
+    call bml_add_deprecated(-1.0_dp,pcm_bml,1.0_dp,pcm_ref_bml,0.0_dp)
+    error_calc = bml_fnorm(pcm_bml)
 
-      if(error_calc.gt.error_tol)then
-         write(*,*) "Error is too high", error_calc
-         error stop
-      endif
+    if(error_calc.gt.error_tol)then
+      write(*,*) "Error is too high", error_calc
+      error stop
+    endif
 
-      call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_buildzdiag")  ! Building inverse overlap factor matrix (Lowdin method)
 
-     call prg_timer_start(loop_timer)
+    call prg_timer_start(loop_timer)
 
-     write(*,*) "Testing buildzdiag from prg_genz_mod"
-     error_tol = 1.0d-9
-     bml_type = "dense"
+    write(*,*) "Testing buildzdiag from prg_genz_mod"
+    error_tol = 1.0d-9
+    bml_type = "dense"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,over_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,over_bml)
 
-     call bml_read_matrix(zmat_bml,'zmatrix.mtx')
-     call bml_read_matrix(over_bml,'overlap.mtx')
+    call bml_read_matrix(zmat_bml,'zmatrix.mtx')
+    call bml_read_matrix(over_bml,'overlap.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
-     !
-     call prg_timer_start(zdiag_timer)
-     call prg_buildzdiag(over_bml,aux_bml,threshold,norb,bml_type)
-     call prg_timer_stop(zdiag_timer)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
+    !
+    call prg_timer_start(zdiag_timer)
+    call prg_buildzdiag(over_bml,aux_bml,threshold,norb,bml_type)
+    call prg_timer_stop(zdiag_timer)
 
-     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,zmat_bml,0.0_dp)
+    call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,zmat_bml,0.0_dp)
 
-     error_calc = bml_fnorm(aux_bml)
+    error_calc = bml_fnorm(aux_bml)
 
-     if(error_calc.gt.error_tol)then
-        write(*,*) "Error is too high", error_calc
-        error stop
-     endif
+    if(error_calc.gt.error_tol)then
+      write(*,*) "Error is too high", error_calc
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
   case("prg_buildzsparse")  ! Building inverse overlap factor matrix (Lowdin method)
 
-     write(*,*) "Testing buildzsparse from prg_genz_mod"
-     error_tol = 1.0d-7
-     bml_type = "ellpack"
+    write(*,*) "Testing buildzsparse from prg_genz_mod"
+    error_tol = 1.0d-7
+    bml_type = "ellpack"
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk1_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk2_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk3_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk4_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk5_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk6_bml)
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,over_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zmat_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk1_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk2_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk3_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk4_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk5_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,zk6_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,over_bml)
 
-     call bml_read_matrix(zmat_bml,'zmatrix.mtx')
-     call bml_read_matrix(over_bml,'overlap.mtx')
+    call bml_read_matrix(zmat_bml,'zmatrix.mtx')
+    call bml_read_matrix(over_bml,'overlap.mtx')
 
-     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
-     !
-     call prg_timer_start(zdiag_timer)
-     call prg_buildZsparse(over_bml,aux_bml,1,mdim,bml_type,zk1_bml,zk2_bml,zk3_bml&
-          &,zk4_bml,zk5_bml,zk6_bml,4,4,3,threshold,threshold,.true.,1)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,aux_bml)
+    !
+    call prg_timer_start(zdiag_timer)
+    call prg_buildZsparse(over_bml,aux_bml,1,mdim,bml_type,zk1_bml,zk2_bml,zk3_bml&
+         &,zk4_bml,zk5_bml,zk6_bml,4,4,3,threshold,threshold,.true.,1)
 
-     call prg_buildzdiag(over_bml,aux_bml,threshold,norb,bml_type)
-     call prg_timer_stop(zdiag_timer)
+    call prg_buildzdiag(over_bml,aux_bml,threshold,norb,bml_type)
+    call prg_timer_stop(zdiag_timer)
 
-     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,zmat_bml,0.0_dp)
+    call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,zmat_bml,0.0_dp)
 
-     error_calc = bml_fnorm(aux_bml)
+    error_calc = bml_fnorm(aux_bml)
 
-     if(error_calc.gt.error_tol)then
-        write(*,*) "Error is too high", error_calc
-        error stop
-     endif
+    if(error_calc.gt.error_tol)then
+      write(*,*) "Error is too high", error_calc
+      error stop
+    endif
 
-     call prg_timer_stop(loop_timer)
+    call prg_timer_stop(loop_timer)
 
 
   case("prg_system_parse_write_xyz")
-     call prg_parse_system(mol,"coords_100","xyz")
-     call prg_write_system(mol, "mysystem","xyz")
-     call system("diff -qs --ignore-all-space mysystem.xyz coords_100.xyz > tmp.tmp")
-     open(1,file="tmp.tmp")
-     read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
-     if(trim(dummy(5)).eq."differ")then
-        write(*,*) "Error coords are not the same"
-        error stop
-     endif
+    call prg_parse_system(mol,"coords_100","xyz")
+    call prg_write_system(mol, "mysystem","xyz")
+    call system("diff -qs --ignore-all-space mysystem.xyz coords_100.xyz > tmp.tmp")
+    open(1,file="tmp.tmp")
+    read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
+    if(trim(dummy(5)).eq."differ")then
+      write(*,*) "Error coords are not the same"
+      error stop
+    endif
 
   case("prg_system_parse_write_pdb")
-     call prg_parse_system(mol,"protein","pdb")
-     call prg_write_system(mol, "mysystem","pdb")
-     call system("diff -qs --ignore-all-space mysystem.pdb protein.pdb > tmp.tmp")
-     open(1,file="tmp.tmp")
-     read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
-     if(trim(dummy(5)).eq."differ")then
-        write(*,*) "Error coords are not the same"
-        error stop
-     endif
+    call prg_parse_system(mol,"protein","pdb")
+    call prg_write_system(mol, "mysystem","pdb")
+    call system("diff -qs --ignore-all-space mysystem.pdb protein.pdb > tmp.tmp")
+    open(1,file="tmp.tmp")
+    read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
+    if(trim(dummy(5)).eq."differ")then
+      write(*,*) "Error coords are not the same"
+      error stop
+    endif
 
   case("prg_system_parse_write_dat")
-     call prg_parse_system(mol,"inputblock","dat")
-     call prg_write_system(mol, "mysystem","dat")
-     call system("diff -qs --ignore-all-space mysystem.dat inputblock.dat > tmp.tmp")
-     open(1,file="tmp.tmp")
-     read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
-     if(trim(dummy(5)).eq."differ")then
-        write(*,*) "Error coords are not the same"
-        error stop
-     endif
+    call prg_parse_system(mol,"inputblock","dat")
+    call prg_write_system(mol, "mysystem","dat")
+    call system("diff -qs --ignore-all-space mysystem.dat inputblock.dat > tmp.tmp")
+    open(1,file="tmp.tmp")
+    read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
+    if(trim(dummy(5)).eq."differ")then
+      write(*,*) "Error coords are not the same"
+      error stop
+    endif
 
   case("prg_twolevel_model")
-     error_tol = 1.0D-10
-     call prg_parse_mham(mham,"input-twolevel.in")
-     call bml_zero_matrix(mham%bml_type,bml_element_real,dp,mham%norbs,mham%norbs,aux_bml)
-     call bml_zero_matrix(mham%bml_type,bml_element_real,dp,mham%norbs,mham%norbs,ham_bml)   
-     call prg_twolevel_model(mham%ea, mham%eb, mham%dab, mham%daiaj, mham%dbibj, &
-            &mham%dec, mham%rcoeff, mham%reshuffle, mham%seed, ham_bml, verbose)
-     call bml_read_matrix(aux_bml,'hamiltonian-twolevel-ref.mtx')
-     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,ham_bml,0.0_dp)
-     error_calc = bml_fnorm(aux_bml)
-
-     write(*,*)"prg_twolevel_model error ", error_calc
-
-     if(error_calc.gt.error_tol)then
-        write(*,*) "Error is too high", error_calc
-        error stop
-     endif
-
-     
-     
-     !---------------------------------------------
-     !LATTE routines
-     !---------------------------------------------
+    error_tol = 1.0D-10
+    call prg_parse_mham(mham,"input-twolevel.in")
+    call bml_zero_matrix(mham%bml_type,bml_element_real,dp,mham%norbs,mham%norbs,aux_bml)
+    call bml_zero_matrix(mham%bml_type,bml_element_real,dp,mham%norbs,mham%norbs,ham_bml)
+    call prg_twolevel_model(mham%ea, mham%eb, mham%dab, mham%daiaj, mham%dbibj, &
+         &mham%dec, mham%rcoeff, mham%reshuffle, mham%seed, ham_bml, verbose)
+    call bml_read_matrix(aux_bml,'hamiltonian-twolevel-ref.mtx')
+    call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,ham_bml,0.0_dp)
+    error_calc = bml_fnorm(aux_bml)
+
+    write(*,*)"prg_twolevel_model error ", error_calc
+
+    if(error_calc.gt.error_tol)then
+      write(*,*) "Error is too high", error_calc
+      error stop
+    endif
+
+
+
+    !---------------------------------------------
+    !LATTE routines
+    !---------------------------------------------
 
   case("load_tbparms_latte")
-     call prg_parse_system(mol,"protein","pdb")
-     !> Loading the tb parameters (electrons.dat)
-     call load_latteTBparams(tbparams,mol%splist,"./")
-     call write_latteTBparams(tbparams,"myelectrons.dat")
-     call system("diff --report-identical-files --ignore-all-space myelectrons.dat electrons.dat > tmp.tmp")
-     open(1,file="tmp.tmp")
-     read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
-     if(trim(dummy(5)).eq."differ")then
-        write(*,*) "Error tbparams are not the same"
-        error stop
-     endif
+    call prg_parse_system(mol,"protein","pdb")
+    !> Loading the tb parameters (electrons.dat)
+    call load_latteTBparams(tbparams,mol%splist,"./")
+    call write_latteTBparams(tbparams,"myelectrons.dat")
+    call system("diff --report-identical-files --ignore-all-space myelectrons.dat electrons.dat > tmp.tmp")
+    open(1,file="tmp.tmp")
+    read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
+    if(trim(dummy(5)).eq."differ")then
+      write(*,*) "Error tbparams are not the same"
+      error stop
+    endif
 
   case("load_bintTBparamsH")
-     call prg_parse_system(mol,"protein","pdb")
-     !> Loading the bint parameters (bondints.nonorth)
-     call load_latteTBparams(tbparams,mol%splist,"./")
-     call load_bintTBparamsH(mol%splist,tbparams%onsite_energ,&
-          typeA,typeB,intKind,onsitesH,onsitesS,intPairsH,intPairsS,"./")
-     call write_bintTBparamsH(typeA,typeB,&
-          intKind,intPairsH,intPairsS,"mybondints.nonorth")
-     call system("diff --report-identical-files --ignore-all-space mybondints.nonorth bondints.nonorth > tmp.tmp")
-     open(1,file="tmp.tmp")
-     read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
-     if(trim(dummy(5)).eq."differ")then
-        write(*,*) "Error bond int tbparams are not the same"
-        error stop
-     endif
+    call prg_parse_system(mol,"protein","pdb")
+    !> Loading the bint parameters (bondints.nonorth)
+    call load_latteTBparams(tbparams,mol%splist,"./")
+    call load_bintTBparamsH(mol%splist,tbparams%onsite_energ,&
+         typeA,typeB,intKind,onsitesH,onsitesS,intPairsH,intPairsS,"./")
+    call write_bintTBparamsH(typeA,typeB,&
+         intKind,intPairsH,intPairsS,"mybondints.nonorth")
+    call system("diff --report-identical-files --ignore-all-space mybondints.nonorth bondints.nonorth > tmp.tmp")
+    open(1,file="tmp.tmp")
+    read(1,*)dummy(1),dummy(2),dummy(3),dummy(4),dummy(5)
+    if(trim(dummy(5)).eq."differ")then
+      write(*,*) "Error bond int tbparams are not the same"
+      error stop
+    endif
 
   case default
 
-     write(*,*)"ERROR: unknown test ",test
-     error stop
+    write(*,*)"ERROR: unknown test ",test
+    error stop
 
   end select
 

From 2ac9f501595d7963809ed1e25eee6cddfa34aed9 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Fri, 16 Jul 2021 17:51:18 +0200
Subject: [PATCH 03/10] Added kernel routine

---
 src/CMakeLists.txt             |   1 +
 src/prg_ewald_mod.F90          | 242 +++++++++++++++++++++++++++++++--
 src/prg_implicit_fermi_mod.F90 | 125 +++++++++++------
 src/prg_xlbokernel_mod.F90     |  73 ++++++----
 4 files changed, 363 insertions(+), 78 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index becd6040..7d5a6641 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -96,6 +96,7 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/prg_chebyshev_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_densitymatrix_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_dos_mod.mod
+  ${CMAKE_CURRENT_BINARY_DIR}/prg_ewald_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_extras_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_genz_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_graph_mod.mod
diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
index 95127797..5539c12d 100644
--- a/src/prg_ewald_mod.F90
+++ b/src/prg_ewald_mod.F90
@@ -15,8 +15,10 @@ module prg_ewald_mod
   public :: Ewald_Real_Space_Single_latte
   public :: Ewald_Real_Space
   public :: Ewald_Real_Space_latte
-  public :: Ewald_k_space
+  public :: Ewald_Real_Space_Test
+  public :: Ewald_k_space_latte_single
   public :: Ewald_k_space_latte
+  public :: Ewald_k_space_Test
 
 contains
 
@@ -374,6 +376,123 @@ subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
 
   end subroutine Ewald_Real_Space_latte
 
+  subroutine  Ewald_Real_Space_Test(COULOMBV,I,RX,RY,RZ,LBox, &
+       DELTAQ,U,Element_Type,Nr_atoms,COULACC,nnRx,nnRy,nnRz,nrnnlist,nnType,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh, I
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, nnI
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+!    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(nnI,J,Rb,Rab,dR,MAGR,MAGR2,TJ,DC,Z,NUMREP_ERFC,CA) &
+!    !$OMP REDUCTION(+:COULOMBV)
+    do nnI = 1,nrnnlist(I)
+      Rb(1) = nnRx(I,nnI)
+      Rb(2) = nnRy(I,nnI)
+      Rb(3) = nnRz(I,nnI)
+      J = nnType(I,nnI)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(J)
+        DC = Rab/dR
+
+        ! Not Using Numerical Recipes ERFC
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        !TEST(ccnt) = DELTAQ(J)*CA
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Type(I).eq.Element_Type(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+          !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
+    enddo
+!    !$OMP END PARALLEL DO
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Test
+
   subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
        DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
 
@@ -623,21 +742,119 @@ subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_
 
   end subroutine Ewald_k_Space_latte
 
-  subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
+  subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULACC)
 
     implicit none
 
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, J
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2
+    real(PREC)                :: CORRFACT,FOURCALPHA2
+    real(PREC)                :: RECIPVECS(3,3)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: IDOT, JDOT, COSJDOT, SINJDOT, KEPREF
+
+    integer                   :: I,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,IDOT,JDOT,COSJDOT,SINJDOT,L,M,N,MMIN,MMAX,NMIN,NMAX,L11,M22,K,K2,PREFACTOR,KEPREF)
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M22 = M*RECIPVECS(2,2)
+
+        do N = NMIN,NMAX
+          K(1) = L11 
+          K(2) = M22 
+          K(3) = N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+
+          PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+          KEPREF = KECONST*PREFACTOR
+          JDOT = K(1)*RXYZ(1,J) + K(2)*RXYZ(2,J) + K(3)*RXYZ(3,J)
+          SINJDOT = sin(JDOT)
+          COSJDOT = cos(JDOT) 
+          do I = 1,Nr_atoms
+            IDOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+            COULOMBV(I) = COULOMBV(I) + KEPREF*DELTAQ(J)*(COSJDOT*cos(IDOT)+SINJDOT*sin(IDOT))
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_latte_single
+
+  subroutine Ewald_k_Space_Test(COULOMBV,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+!
+    implicit none
+!
     integer, parameter        :: PREC = 8
     real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
     real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
     real(PREC), parameter     :: pi = 3.14159265358979323846264D0
     real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
     integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
-    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC), intent(in)    :: COULACC
     real(PREC)                :: KECONST, TFACT, RELPERM
     real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
     real(PREC)                :: COULCUT, COULCUT2
-    real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
     real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
     real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
     real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
@@ -647,7 +864,7 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
     real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
 
     integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-
+!
     COULVOL = LBox(1)*LBox(2)*LBox(3)
     SQRTX = sqrt(-log(COULACC))
 
@@ -673,7 +890,6 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
     RELPERM = 1.D0
     KECONST = 14.3996437701414D0*RELPERM
 
-    FCOUL = ZERO
     COULOMBV = ZERO
     SINLIST = ZERO
     COSLIST = ZERO
@@ -714,6 +930,9 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
             SINSUM = 0.D0
 
             ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+            !$OMP REDUCTION(+:COSSUM) &
+            !$OMP REDUCTION(+:SINSUM)
             do I = 1,Nr_atoms
               DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
               ! We re-use these in the next loop...
@@ -722,19 +941,18 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
               COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
               SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
             enddo
+            !$OMP END PARALLEL DO
             COSSUM2 = COSSUM*COSSUM
             SINSUM2 = SINSUM*SINSUM
 
             ! Add up energy and force contributions
-
+!
             KEPREF = KECONST*PREFACTOR
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
             do I = 1,Nr_atoms
               COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
-              FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
-              FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
-              FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
-              FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
             enddo
+            !$OMP END PARALLEL DO
 
             KEPREF = KEPREF*(COSSUM2 + SINSUM2)
           endif
@@ -746,6 +964,6 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
     CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
     COULOMBV = COULOMBV - CORRFACT*DELTAQ;
 
-  end subroutine Ewald_k_Space
+  end subroutine Ewald_k_Space_Test
 
 end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 76bd2bc2..37d03618 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -58,11 +58,11 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     integer, intent(inout) :: occiter
 
     type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
-    real(dp) :: trdPdmu, trP0, occErr, alpha
-    real(dp) :: cnst, ofactor, mustep
+    real(dp) :: trdPdmu, trP0, occErr, alpha, newerr
+    real(dp) :: cnst, ofactor, mustep, preverr
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
-    integer :: N, M, i, iter, muadj, prev
+    integer :: N, M, i, iter, muadj, prev, maxiter
 
     bml_type = bml_get_type(h_bml)
     N = bml_get_N(h_bml)
@@ -78,9 +78,12 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
 
     occErr = 10.0_dp
-    alpha = 1.0_dp
+    newerr = 1000_dp
+    preverr = 1000_dp
+    alpha = 8.0_dp
     prev = 0
     iter = 0
+    maxiter = 30
     cnst = beta/(1.0_dp*2**(nsteps+2))
 
     if (SCF_IT .eq. 1) then
@@ -95,12 +98,15 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
       call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
       call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+      do i = 1, nsteps
+        call bml_copy(I_bml, Inv_bml(i))
+      enddo 
     else
       ! Otherwise use previous inverse as starting guess
       call bml_copy(Inv_bml(1),ai_bml)
     end if
 
-    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < 50)
+    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < maxiter)
       iter = iter + 1
       muadj = 0
       write(*,*) 'mu =', mu
@@ -116,46 +122,65 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find inverse ai = (2*(P2-P)+I)^-1
-        !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
-        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
-        call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
-        call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
+        !call prg_conjgrad(y_bml, Inv_bml(i), I_bml, aux_bml, d_bml, w_bml, tol, threshold)
+        !call bml_copy(Inv_bml(i),ai_bml) 
+        call prg_newtonschulz(y_bml, Inv_bml(i), d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+        call bml_multiply(Inv_bml(i), p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+        !call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
       enddo
 
-      trdPdmu = bml_trace(p_bml)
-      trP0 = trdPdmu
-      trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
-      trdPdmu = beta * trdPdmu
+      trP0 = bml_trace(p_bml)
+      trdPdmu = beta*(trP0 - bml_sum_squares(p_bml)) ! sum p(i,j)**2
       occErr = abs(trP0 - nocc)
-      write(*,*) 'occerr =', nocc-trP0
-
+      write(*,*) 'occerr =', occErr
+       
       ! If occupation error is too large, do bisection method
-      if (occerr > 10.0_dp) then
-        if (nocc-trP0 < 0.0_dp) then
-          if (prev .eq. 1) then
-            alpha = alpha/2
-          endif
+      if (occerr > 1.0_dp) then
+     ! if (newerr > occerr) then 
+        if (nocc-trP0 < 0.0_dp .and. prev .eq. -1) then
           prev = -1
-          mu = mu - alpha
-        else
-          if (prev .eq. -1) then
-            alpha = alpha/2
-          endif
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. 1) then 
           prev = 1
-          mu = mu + alpha
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. -1) then
+          prev = 1
+          alpha = alpha/2
+      !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    do while(newerr > occerr)
+      !      alpha = alpha/2
+      !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    enddo
+        else 
+          prev = -1 
+          alpha = alpha/2 
+      !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    do while(newerr > occerr)
+      !      alpha = alpha/2
+      !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    enddo
         endif
+        !newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+        !do while(newerr > occerr .and. abs(occerr-newerr)/occerr<0.1 )
+        !  alpha = alpha/2
+        !  newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+        !enddo
+!        write(*,*) 'newerr =', newerr
+        mu = mu + prev*alpha
+        muadj = 1
+
         ! Otherwise do Newton
       else if (occErr .gt. occErrLimit) then
         mustep = (nocc -trP0)/trdPdmu
-        if (abs(mustep) > 1.0) then
-          mustep = 0.1_dp*mustep
-        end if
+        !if (abs(mustep) > 1.0 .or. preverr < occErr) then
+        !  alpha = alpha/2
+        !  mustep = alpha*mustep
+        !end if
         mu = mu + mustep
         muadj = 1
+        preverr = occErr
       end if
     enddo
 
-    if (iter .ge. 50) then 
+    if (iter .ge. maxiter) then 
             write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
     end if 
     ! Adjusting the occupation sometimes causes the perturbation calculation to not converge.
@@ -432,12 +457,12 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     implicit none
 
     type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, Inv_bml(nsteps)
-    type(bml_matrix_t), intent(inout) :: P0_bml, P1_bml
+    type(bml_matrix_t), intent(inout) :: P0_bml,P1_bml
     real(dp), intent(in) :: mu0, threshold
     real(dp)  :: mu1
     real(dp), intent(in) :: beta, nocc
     integer, intent(in) :: nsteps
-    type(bml_matrix_t) :: B_bml, C_bml, C0_bml
+    type(bml_matrix_t) :: B0_bml, B_bml, C_bml, C0_bml
     character(20) :: bml_type
     real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst
     integer :: N, M, i, j, k
@@ -446,6 +471,7 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     N = bml_get_N(H0_bml)
     M = bml_get_M(H0_bml)
 
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B0_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
@@ -477,11 +503,31 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
       call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
     enddo
 
+!    do i = 1, nsteps-1
+      ! D = A^-1*P0
+!      call bml_multiply(Inv_bml(i), B0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+!      call bml_multiply(C0_bml, B0_bml, B_bml, 1.0_dp, 0.0_dp, threshold)
+      ! B0 = A^-1*P0^2
+!      call bml_copy(B_bml,B0_bml)
+      ! B = I + D -P0*D
+!      call bml_add(B_bml, C0_bml, -1.0_dp, 1.0_dp, threshold)
+!      call bml_scale_add_identity(B_bml, 1.0_dp, 1.0_dp, threshold)
+      ! P1 = 2D*P1(I+D-P0*D)
+!      call bml_multiply(C0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+!      call bml_multiply(C_bml, B_bml, P1_bml, 2.0_dp, 0.0_dp, threshold)
+!    enddo 
+!      call bml_multiply(B0_bml, P1_bml, C_bml, 2.0_dp, 0.0_dp, threshold)
+!      call bml_copy(P1_bml, B_bml)
+!      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+      ! Get next P1
+!      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+!      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+
+
     ! dPdmu = beta*P0(I-P0)
     call bml_copy(P0_bml, B_bml)
     call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
-    call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-    call bml_scale(beta,C_bml)
+    call bml_multiply(P0_bml, B_bml, C_bml, beta, 0.0_dp, threshold)
     dPdmu_trace = bml_trace(C_bml)
     p1_trace = bml_trace(P1_bml)
     mu1 =  - p1_trace/dPdmu_trace
@@ -490,6 +536,7 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     endif
 
     call bml_deallocate(B_bml)
+    call bml_deallocate(B0_bml)
     call bml_deallocate(C_bml)
     call bml_deallocate(C0_bml)
 
@@ -866,9 +913,7 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
     N = bml_get_N(a_bml)
     err = 100000.0
     i = 0
-    N2 = N*N
-    scaled_tol = tol*N
-    do while(err > scaled_tol)
+    do while(err > tol)
       !write(*,*) 'iter = ', i
       !write(*,*) 'ns error =', err
       call bml_copy(ai_bml, tmp_bml)
@@ -880,7 +925,7 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
       !write(*,*) "prev_err = ", prev_err
       if (10*prev_err < err) then
         write(*,*) 'NS did not converge, calling conjugate gradient'
-        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.0001_dp, threshold)
+        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.00001_dp, threshold)
       else
         call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
       endif
@@ -990,7 +1035,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
 
     do while (r_norm_new .gt. cg_tol)
 
-      write(*,*) r_norm_new
+    !  write(*,*) r_norm_new
       k = k + 1
       if (k .eq. 1) then
         call bml_copy(tmp_bml, d_bml)
@@ -1011,7 +1056,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
         stop
       endif
     enddo
-    write(*,*) "Number of CG-iterations:", k
+    !write(*,*) "Number of CG-iterations:", k
 
   end subroutine prg_conjgrad
 
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
index a19cb5b2..81496915 100644
--- a/src/prg_xlbokernel_mod.F90
+++ b/src/prg_xlbokernel_mod.F90
@@ -83,23 +83,23 @@ end subroutine Invert
   !! \param Nr_elem Number of elements in Hubbard list.
   subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, &
        Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
-       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
 
     !! Res = q[n] - n
     !! KK0 is preconditioner
     !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
 
     implicit none
-    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+    integer,    intent(in)      :: Nr_atoms, HDIM, Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
     real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
+    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu, Nocc
     real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
     real(dp), intent(in)      :: Res(Nr_atoms)
     integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
     real(dp), intent(in)      :: Hubbard_U(Nr_elem)
     type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
     real(dp)                  :: K0Res(Nr_atoms)
-    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml, DO_bml,D1_bml
     real(dp), intent(in)      :: threshold
     integer,    intent(in)      :: LMAX
     character(20)             :: bml_type
@@ -115,17 +115,19 @@ subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_
     real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
     real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
     real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
-    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml, X_bml, Y_bml
 
     call timer_prg_init()
     bml_type = bml_get_type(HO_bml)
     N = bml_get_N(HO_bml)
     MN = bml_get_M(HO_bml)
     allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
-    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
-    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
-    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,X_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
 
     call bml_transpose(Z_bml,ZT_bml)
 
@@ -137,6 +139,7 @@ subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_
     K0Res = row_NA
     dr = K0Res
 
+    write(*,*) 'resnorm =', norm2(Res), 'kresnorm =', norm2(dr)
     I = 0
     Fel = 1.D0
     do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
@@ -249,6 +252,8 @@ subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_
     call bml_deallocate(K0Res_bml)
     call bml_deallocate(Res_bml)
     call bml_deallocate(ZT_bml)
+    call bml_deallocate(X_bml)
+    call bml_deallocate(Y_bml)
     call prg_timer_shutdown()
 
   end subroutine prg_kernel_multirank_latte
@@ -327,8 +332,8 @@ subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX
       enddo
       !$OMP END PARALLEL DO
 
-      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+     ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+     ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       row1 = 0.0_dp
       do J = 1,HDIM
@@ -433,41 +438,43 @@ end subroutine prg_kernel_multirank
   !! \param Nr_elem Number of elements in Hubbard list.
   subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
        Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, &
-       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml, &
        Nocc,m_rec,threshold,beta,Nr_elem)
 
     use bml
 
     implicit none
     integer, parameter             :: PREC = 8, dp = kind(1.0d0)
-    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
+    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Max_Nr_Neigh,m_rec
     real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
     real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
-    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
+    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta,Nocc
     real(PREC)                     :: v(Nr_atoms)
     real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
     integer,    intent(in)         :: Hinxlist(HDIM)
     real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
     type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
     type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
-    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml
     real(PREC), intent(inout)      :: mu0
     integer,    intent(in)         :: Element_Pointer(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
     real(PREC)                     :: dq_v(Nr_atoms)
-    real(PREC)                     :: dq_dv(Nr_atoms), err,tol
+    real(PREC)                     :: dq_dv(Nr_atoms), err,tol,start,ewaldk,ewaldr,ewaldkacc,ewaldracc,response,respacc
     real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
     integer                        :: I,J,K, ITER, mm,It,N,MN
     real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
-    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
+    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml, X_bml, Y_bml
     character(20)                  :: bml_type
 
     bml_type = bml_get_type(HO_bml)
     N = bml_get_N(HO_bml)
     MN = bml_get_M(HO_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,X_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
     call bml_transpose(Z_bml,ZT_bml)
     allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
     Coulomb_Pot_dq_v = ZERO
@@ -476,7 +483,11 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     JJ = ZERO
     KK = ZERO
 
+    write(*,*) 'Beginning response calculations'
+    ewaldracc = 0.0; ewaldkacc = 0.0; respacc = 0.0;
     do J = 1,Nr_atoms
+      write(*,*) 'J =',J
+      call cpu_time(start)
       dq_v(J) = ONE
       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
       do I = 1,Nr_atoms
@@ -485,9 +496,12 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
       enddo
       !$OMP END PARALLEL DO
-      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
-           Max_Nr_Neigh)
+      call cpu_time(ewaldr)
+      ewaldracc = ewaldracc + ewaldr-start
+      call Ewald_k_Space_latte_single(Coulomb_Pot_k,J,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc)
       Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      call cpu_time(ewaldk)
+      ewaldkacc = ewaldkacc + ewaldk-ewaldr
 
       call bml_deallocate(H1_bml)
       call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
@@ -512,7 +526,9 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
       call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
       call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec,mu0,beta,real(nocc,PREC),threshold)
+           m_rec,mu0,beta,Nocc,threshold)
+      call cpu_time(response)
+      respacc = respacc + response-ewaldk
 
       call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
       call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
@@ -540,10 +556,15 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
       JJ(I,I) = JJ(I,I) - ONE
     enddo
 
+    write(*,*) 'ewaldracc =', ewaldracc
+    write(*,*) 'ewaldkacc =', ewaldkacc 
+    write(*,*) 'implcit response time =', respacc
     call Invert(JJ,KK,Nr_atoms)
 
     deallocate(row1); deallocate(row2); deallocate(JJ)
     call bml_deallocate(ZT_bml)
+    call bml_deallocate(X_bml)
+    call bml_deallocate(Y_bml)
 
   end subroutine prg_full_kernel_latte
 
@@ -627,8 +648,8 @@ subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
       enddo
       !$OMP END PARALLEL DO
-      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
-           TIMERATIO,Max_Nr_Neigh)
+    !  call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+    !       TIMERATIO,Max_Nr_Neigh)
       Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       diagonal = 0.0_dp
@@ -691,9 +712,9 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
     !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
 
     implicit none
-    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    integer,    intent(in)      :: Nr_atoms, HDIM, Max_Nr_Neigh,m_rec
     real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu,Nocc
     real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
     real(dp), intent(in)      :: Res(Nr_atoms)
     integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
@@ -745,8 +766,8 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
       enddo
       !$OMP END PARALLEL DO
 
-      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+     ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+     ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       row1 = 0.0_dp
       do J = 1,HDIM
@@ -771,7 +792,7 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
       call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
       call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec, mu, beta, real(nocc,dp), threshold)
+           m_rec, mu, beta, Nocc, threshold)
 
       call bml_transpose(Z_bml,X_bml)
       call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)

From 02e93a6c87bbfaa9fdfd775ee5c4b77c10f0f0fd Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Fri, 16 Jul 2021 18:23:21 +0200
Subject: [PATCH 04/10] Fixed test

---
 tests/src/main.F90 | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/src/main.F90 b/tests/src/main.F90
index 3ab0354f..530a3093 100644
--- a/tests/src/main.F90
+++ b/tests/src/main.F90
@@ -231,11 +231,10 @@ program main
     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
-    write(*,*) mu
+    
     mu = 0.2_dp  
     call prg_implicit_fermi_save_inverse(inv_bml,ham_bml,rho_bml,norecs,nocc,mu,beta,1e-4_dp, threshold, 1e-5_dp, 1,occiter)
-
+    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
     write(*,*) mu
     call bml_scale(0.5_dp,rho_bml)
     call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)

From ce0e474cf5151279226874e13ac4d791a444218a Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Mon, 11 Jan 2021 15:49:53 +0100
Subject: [PATCH 05/10] Added kernel routines for XL-BOMD

---
 src/CMakeLists.txt             |   3 +
 src/prg_ewald_mod.F90          | 751 +++++++++++++++++++++++++++++
 src/prg_implicit_fermi_mod.F90 | 326 +++++++++++--
 src/prg_xlbokernel_mod.F90     | 837 +++++++++++++++++++++++++++++++++
 4 files changed, 1880 insertions(+), 37 deletions(-)
 create mode 100644 src/prg_ewald_mod.F90
 create mode 100644 src/prg_xlbokernel_mod.F90

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e0f63b54..9c30e5be 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,6 +18,7 @@ add_library(progress
   prg_chebyshev_mod.F90
   prg_densitymatrix_mod.F90
   prg_dos_mod.F90
+  prg_ewald_mod.F90
   prg_extras_mod.F90
   prg_genz_mod.F90
   prg_graph_mod.F90
@@ -46,6 +47,7 @@ add_library(progress
   prg_subgraphloop_mod.F90
   prg_system_mod.F90
   prg_timer_mod.F90
+  prg_xlbokernel_mod.F90
   prg_xlbo_mod.F90)
   #prg_xlkernel_mod.F90)
 
@@ -122,6 +124,7 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/prg_system_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_timer_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_xlbo_mod.mod
+  ${CMAKE_CURRENT_BINARY_DIR}/prg_xlbokernel_mod.mod
   #  ${CMAKE_CURRENT_BINARY_DIR}/prg_xlkernel_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/slaterkosterforce_latte_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/tbparams_latte_mod.mod
diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
new file mode 100644
index 00000000..5a032412
--- /dev/null
+++ b/src/prg_ewald_mod.F90
@@ -0,0 +1,751 @@
+! Ewald sum routines for kernel calculation 
+module prg_ewald_mod
+
+  use bml
+  use prg_timer_mod
+  use prg_parallel_mod
+
+  implicit none
+
+  private  !Everything is private by default
+
+  integer, parameter :: dp = kind(1.0d0)
+
+  public :: Ewald_Real_Space_Single
+  public :: Ewald_Real_Space_Single_latte
+  public :: Ewald_Real_Space
+  public :: Ewald_Real_Space_latte
+  public :: Ewald_k_space
+  public :: Ewald_k_space_latte
+
+contains
+
+!> Find Coulomb potential on site I from single charge at site J          
+subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
+    DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
+real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+real(PREC), intent(in)    :: U(Nr_elem)
+real(PREC)                :: COULCUT, COULCUT2
+real(PREC), intent(out)   :: COULOMBV 
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+integer                   :: K, ccnt,l,m,n
+
+COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+COULOMBV = ZERO
+
+TI = TFACT*U(Element_Pointer(I))
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RXYZ(1,I)
+Ra(2) = RXYZ(2,I)
+Ra(3) = RXYZ(3,I)
+
+    do k = -1,1
+    do m = -1,1
+    do l = -1,1
+
+    Rb(1) = RXYZ(1,J)+k*box(1,1)
+    Rb(2) = RXYZ(2,J)+m*box(2,2)
+    Rb(3) = RXYZ(3,J)+l*box(3,3)
+    Rab = Rb-Ra  ! OBS b - a !!!
+    dR = norm2(Rab)
+    MAGR = dR
+    MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(Element_Pointer(J))
+       DC = Rab/dR
+
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Pointer(I).eq.Element_Pointer(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+       endif
+    endif
+    enddo
+    enddo
+    enddo
+
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space_Single_latte
+
+subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+    DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
+
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
+real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+real(PREC), intent(in)    :: U(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+character(10), intent(in) :: Element_Type(Nr_atoms)
+real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+integer                   :: K, ccnt,l,m,n
+
+COULVOL = LBox(1)*LBox(2)*LBox(3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+FCOUL = ZERO
+COULOMBV = ZERO
+
+TI = TFACT*U(I)
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RX(I)
+Ra(2) = RY(I)
+Ra(3) = RZ(I)
+
+    do k = -1,1
+    do m = -1,1
+    do l = -1,1
+
+    Rb(1) = RX(J)+k*Lbox(1)
+    Rb(2) = RY(J)+m*Lbox(2)
+    Rb(3) = RZ(J)+l*Lbox(3)
+    Rab = Rb-Ra  ! OBS b - a !!!
+    dR = norm2(Rab)
+    MAGR = dR
+    MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(J)
+       DC = Rab/dR
+
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Type(I).eq.Element_Type(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+       endif
+
+       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+    endif
+    enddo
+    enddo
+    enddo
+
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space_Single
+
+subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
+    DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
+real(PREC), intent(in)    :: COULACC
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+real(PREC), intent(in)    :: U(Nr_elem)
+real(PREC)                :: COULCUT, COULCUT2
+integer, intent(in)       :: Element_Pointer(Nr_atoms)
+integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+real(PREC), intent(out)   :: COULOMBV
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+COULOMBV = ZERO
+
+TI = TFACT*U(Element_Pointer(I))
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RXYZ(1,I)
+Ra(2) = RXYZ(2,I)
+Ra(3) = RXYZ(3,I)
+
+do newj = 1,totnebcoul(I)
+        J = NEBCOUL(1, NEWJ, I)
+        PBCI = NEBCOUL(2, NEWJ, I)
+        PBCJ = NEBCOUL(3, NEWJ, I)
+        PBCK = NEBCOUL(4, NEWJ, I)
+        Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+             REAL(PBCK)*BOX(3,1) 
+
+        Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+             REAL(PBCK)*BOX(3,2) 
+
+        Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+             REAL(PBCK)*BOX(3,3) 
+        Rab = Rb-Ra  ! OBS b - a !!!
+        dR = norm2(Rab)
+        MAGR = dR
+        MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(Element_Pointer(J))
+       DC = Rab/dR
+
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Pointer(I).eq.Element_Pointer(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+       endif
+
+    endif
+enddo
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space_latte
+
+subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+    DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+
+implicit none
+
+integer,    parameter     :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
+real(PREC), intent(in)    :: COULACC, TIMERATIO
+real(PREC)                :: TFACT, RELPERM, KECONST
+real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+real(PREC), intent(in)    :: U(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+character(10), intent(in) :: Element_Type(Nr_atoms)
+integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+integer                   :: J,K, ccnt, nnI
+
+COULVOL = LBox(1)*LBox(2)*LBox(3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+COULCUT = 12.D0
+CALPHA = SQRTX/COULCUT
+COULCUT2 = COULCUT*COULCUT
+CALPHA2 = CALPHA*CALPHA
+
+RELPERM = ONE
+KECONST = 14.3996437701414D0*RELPERM
+TFACT  = 16.0D0/(5.0D0*KECONST)
+
+FCOUL = ZERO
+COULOMBV = ZERO
+
+TI = TFACT*U(I)
+TI2 = TI*TI
+TI3 = TI2*TI
+TI4 = TI2*TI2
+TI6 = TI4*TI2
+
+SSA = TI
+SSB = TI3/48.D0
+SSC = 3.D0*TI2/16.D0
+SSD = 11.D0*TI/16.D0
+SSE = 1.D0
+
+Ra(1) = RX(I)
+Ra(2) = RY(I)
+Ra(3) = RZ(I)
+
+do nnI = 1,nrnnlist(I)
+  Rb(1) = nnRx(I,nnI)
+  Rb(2) = nnRy(I,nnI)
+  Rb(3) = nnRz(I,nnI)
+  J = nnType(I,nnI)
+    Rab = Rb-Ra  ! OBS b - a !!!
+    dR = norm2(Rab)
+    MAGR = dR
+    MAGR2 = dR*dR
+
+    if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+       TJ = TFACT*U(J)
+       DC = Rab/dR
+
+       ! Not Using Numerical Recipes ERFC
+       Z = abs(CALPHA*MAGR)
+       NUMREP_ERFC = erfc(Z)
+
+       CA = NUMREP_ERFC/MAGR
+       COULOMBV = COULOMBV + DELTAQ(J)*CA
+       ccnt = ccnt + 1
+       !TEST(ccnt) = DELTAQ(J)*CA
+       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+       EXPTI = exp(-TI*MAGR )
+
+       if (Element_Type(I).eq.Element_Type(J)) then
+           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           ccnt = ccnt + 1
+           !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+       else
+           TJ2 = TJ*TJ
+           TJ3 = TJ2*TJ
+           TJ4 = TJ2*TJ2
+           TJ6 = TJ4*TJ2
+           EXPTJ = exp( -TJ*MAGR )
+           TI2MTJ2 = TI2 - TJ2
+           TJ2MTI2 = -TI2MTJ2
+           SA = TI
+           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+           SD = TJ
+           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+       endif
+
+       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+    endif
+enddo
+COULOMBV = KECONST*COULOMBV
+
+end subroutine Ewald_Real_Space
+
+subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+
+implicit none
+
+integer, parameter        :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+real(PREC), intent(in)    :: COULACC
+real(PREC)                :: KECONST, TFACT, RELPERM
+real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+
+COULCUT = 12.0D0
+CALPHA = SQRTX/COULCUT
+
+COULCUT2 = COULCUT*COULCUT
+KCUTOFF = TWO*CALPHA*SQRTX
+KCUTOFF2 = KCUTOFF*KCUTOFF
+CALPHA2 = CALPHA*CALPHA
+FOURCALPHA2 = FOUR*CALPHA2
+
+RECIPVECS = ZERO
+RECIPVECS(1,1) = TWO*pi/Box(1,1)
+RECIPVECS(2,2) = TWO*pi/Box(2,2)
+RECIPVECS(3,3) = TWO*pi/Box(3,3)
+LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+RELPERM = 1.D0
+KECONST = 14.3996437701414D0*RELPERM
+
+COULOMBV = ZERO
+SINLIST = ZERO
+COSLIST = ZERO
+
+do L = 0,LMAX
+
+  if (L.eq.0) then
+    MMIN = 0
+  else
+    MMIN = -MMAX
+  endif
+
+  L11 = L*RECIPVECS(1,1)
+  L12 = L*RECIPVECS(1,2)
+  L13 = L*RECIPVECS(1,3)
+
+  do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+           K(1) = M21 + N*RECIPVECS(3,1)
+           K(2) = M22 + N*RECIPVECS(3,2)
+           K(3) = M23 + N*RECIPVECS(3,3)
+           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+           if (K2.le.KCUTOFF2) then
+              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+              COSSUM = 0.D0
+              SINSUM = 0.D0
+
+              ! Doing the sin and cos sums
+              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+              !$OMP REDUCTION(+:COSSUM) & 
+              !$OMP REDUCTION(+:SINSUM)  
+              do I = 1,Nr_atoms
+                 DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+                 ! We re-use these in the next loop...
+                 SINLIST(I) = sin(DOT)
+                 COSLIST(I) = cos(DOT)
+                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+              enddo
+              !$OMP END PARALLEL DO
+              COSSUM2 = COSSUM*COSSUM
+              SINSUM2 = SINSUM*SINSUM
+
+              ! Add up energy and force contributions
+
+              KEPREF = KECONST*PREFACTOR
+              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I) 
+              do I = 1,Nr_atoms
+                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+              enddo
+              !$OMP END PARALLEL DO
+
+              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+           endif
+        enddo
+  enddo
+enddo
+
+! Point self energy
+CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+end subroutine Ewald_k_Space_latte
+
+subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
+
+implicit none
+
+integer, parameter        :: PREC = 8
+real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+real(PREC), intent(in)    :: COULACC, TIMERATIO
+real(PREC)                :: KECONST, TFACT, RELPERM
+real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+real(PREC)                :: COULCUT, COULCUT2
+real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
+real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+COULVOL = LBox(1)*LBox(2)*LBox(3)
+SQRTX = sqrt(-log(COULACC))
+
+ccnt = 0
+
+COULCUT = 12.0D0
+CALPHA = SQRTX/COULCUT
+
+COULCUT2 = COULCUT*COULCUT
+KCUTOFF = TWO*CALPHA*SQRTX
+KCUTOFF2 = KCUTOFF*KCUTOFF
+CALPHA2 = CALPHA*CALPHA
+FOURCALPHA2 = FOUR*CALPHA2
+
+RECIPVECS = ZERO
+RECIPVECS(1,1) = TWO*pi/LBox(1)
+RECIPVECS(2,2) = TWO*pi/LBox(2)
+RECIPVECS(3,3) = TWO*pi/LBox(3)
+LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+RELPERM = 1.D0
+KECONST = 14.3996437701414D0*RELPERM
+
+FCOUL = ZERO
+COULOMBV = ZERO
+SINLIST = ZERO
+COSLIST = ZERO
+
+do L = 0,LMAX
+
+  if (L.eq.0) then
+    MMIN = 0
+  else
+    MMIN = -MMAX
+  endif
+
+  L11 = L*RECIPVECS(1,1)
+  L12 = L*RECIPVECS(1,2)
+  L13 = L*RECIPVECS(1,3)
+
+  do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+           K(1) = M21 + N*RECIPVECS(3,1)
+           K(2) = M22 + N*RECIPVECS(3,2)
+           K(3) = M23 + N*RECIPVECS(3,3)
+           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+           if (K2.le.KCUTOFF2) then
+              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+              COSSUM = 0.D0
+              SINSUM = 0.D0
+
+              ! Doing the sin and cos sums
+              do I = 1,Nr_atoms
+                 DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
+                 ! We re-use these in the next loop...
+                 SINLIST(I) = sin(DOT)
+                 COSLIST(I) = cos(DOT)
+                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+              enddo
+              COSSUM2 = COSSUM*COSSUM
+              SINSUM2 = SINSUM*SINSUM
+
+              ! Add up energy and force contributions
+
+              KEPREF = KECONST*PREFACTOR
+              do I = 1,Nr_atoms
+                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+                 FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
+                 FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
+                 FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
+                 FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
+              enddo
+
+              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+           endif
+        enddo
+  enddo
+enddo
+
+! Point self energy
+CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+end subroutine Ewald_k_Space
+
+end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 5c30947c..2e217a98 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -5,11 +5,13 @@
 !!
 module prg_implicit_fermi_mod
 
+  use omp_lib
   use bml
   use prg_normalize_mod
   use prg_densitymatrix_mod
   use prg_timer_mod
   use prg_parallel_mod
+  use prg_ewald_mod
 
   implicit none
 
@@ -18,13 +20,164 @@ module prg_implicit_fermi_mod
   integer, parameter :: dp = kind(1.0d0)
 
   public :: prg_implicit_fermi
+  public :: prg_implicit_fermi_save_inverse
   public :: prg_implicit_fermi_zero
   public :: prg_test_density_matrix
   public :: prg_implicit_fermi_response
+  public :: prg_implicit_fermi_first_order_response
   public :: prg_finite_diff
 
 contains
 
+  !> Recursive Implicit Fermi Dirac for finite temperature.
+  !! \param Inv_bml Inverses generated by algorithm.  
+  !! \param h_bml Input Hamiltonian matrix.
+  !! \param p_bml Output density matrix.
+  !! \param nsteps Number of recursion steps.
+  !! \param nocc Number of occupied states.
+  !! \param mu Shifted chemical potential
+  !! \param beta Input inverse temperature.
+  !! \param occErrLimit Occupation error limit.
+  !! \param threshold Threshold for multiplication.
+  !! \param tol Tolerance for linear system solver.
+  !! \param SCF_IT The current SCF iteration. 
+  !! \param occiter Counts the total nr of DM calculations during MD. 
+  !! See \cite{niklasson2003}
+  subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc, &
+       mu, beta, occErrLimit, threshold, tol,SCF_IT, occiter)
+
+    implicit none
+
+    type(bml_matrix_t), intent(in) :: h_bml
+    type(bml_matrix_t), intent(inout) :: p_bml, Inv_bml(nsteps)
+    integer, intent(in) :: nsteps, SCF_IT
+    real(dp), intent(in) :: nocc, threshold
+    real(dp), intent(in) :: tol
+    real(dp), intent(in) :: occErrLimit, beta
+    real(dp), intent(inout) :: mu
+    integer, intent(inout) :: occiter
+
+    type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
+    real(dp) :: trdPdmu, trP0, occErr, alpha 
+    real(dp) :: cnst, ofactor, mustep 
+    real(dp), allocatable :: trace(:), gbnd(:)
+    character(20) :: bml_type
+    integer :: N, M, i, iter, muadj, prev
+
+    bml_type = bml_get_type(h_bml)
+    N = bml_get_N(h_bml)
+    M = bml_get_M(h_bml)
+
+    allocate(trace(2))
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml) 
+
+    occErr = 10.0_dp
+    alpha = 1.0_dp
+    prev = 0
+    iter = 0
+    cnst = beta/(1.0_dp*2**(nsteps+2))
+
+    if (SCF_IT .eq. 1) then
+      ! Normalization
+      ! P0 = 0.5*I - cnst*(H0-mu0*I)
+      call bml_copy(h_bml, p_bml)
+      call prg_normalize_implicit_fermi(p_bml, cnst, mu)
+      ! Generate good starting guess for (2*(P2-P)+1)^-1 using conjugate gradient
+      call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+      ! Y = 2*(P2-P) + II
+      call bml_copy(p2_bml, y_bml)
+      call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+      call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+      call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+    else
+      ! Otherwise use previous inverse as starting guess
+      call bml_copy(Inv_bml(1),ai_bml) 
+    end if
+
+    do while (occErr .gt. occErrLimit .or. muadj .eq. 1) 
+      iter = iter + 1
+      muadj = 0
+      write(*,*) 'mu =', mu
+      ! Normalization
+      ! P0 = 0.5*I - cnst*(H0-mu0*I)
+      call bml_copy(h_bml, p_bml)
+      call prg_normalize_implicit_fermi(p_bml, cnst, mu)
+
+        do i = 1, nsteps
+            call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+            ! Y = 2*(P2-P) + I
+            call bml_copy(p2_bml, y_bml)
+            call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+            call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+            ! Find inverse ai = (2*(P2-P)+I)^-1
+            !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
+            call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+            call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+            call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation  
+        enddo
+
+      trdPdmu = bml_trace(p_bml)
+      trP0 = trdPdmu
+      trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
+      trdPdmu = beta * trdPdmu
+      occErr = abs(trP0 - nocc)
+      write(*,*) 'occerr =', nocc-trP0
+
+      ! If occupation error is too large, do bisection method 
+      if (occerr > 10.0_dp) then 
+         if (nocc-trP0 < 0.0_dp) then
+            if (prev .eq. 1) then
+               alpha = alpha/2
+            endif 
+            prev = -1
+            mu = mu - alpha
+         else 
+            if (prev .eq. -1) then
+               alpha = alpha/2
+            endif
+            prev = 1
+            mu = mu + alpha 
+         endif  
+      ! Otherwise do Newton 
+      else if (occErr .gt. occErrLimit) then
+        mustep = (nocc -trP0)/trdPdmu
+        if (abs(mustep) > 1.0) then
+          mustep = 0.1_dp*mustep
+        end if  
+        mu = mu + mustep 
+        muadj = 1
+      end if
+    enddo
+    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge. 
+    ! For now we recompute the DM one extra time if mu was adjusted. 
+    !if (muadj .eq. 1) then
+      ! Adjust occupation
+     ! call bml_copy(p_bml, d_bml)
+     ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
+     ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
+     ! ofactor = ((nocc - trP0)/trdPdmu) * beta
+     ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    !end if
+    occiter = occiter + iter
+    call bml_scale(2.0_dp,p_bml)
+    deallocate(trace)
+
+    call bml_deallocate(p2_bml)
+    call bml_deallocate(w_bml)
+    call bml_deallocate(d_bml)
+    call bml_deallocate(y_bml)
+    call bml_deallocate(aux_bml)
+    call bml_deallocate(ai_bml)
+    call bml_deallocate(I_bml)
+
+  end subroutine prg_implicit_fermi_save_inverse
+  
   !> Recursive Implicit Fermi Dirac for finite temperature.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
@@ -63,6 +216,11 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     N = bml_get_N(h_bml)
     M = bml_get_M(h_bml)
 
+    call bml_print_matrix("h_bml",h_bml,1,10,1,10)
+    call bml_print_matrix("p_bml",p_bml,1,10,1,10)
+    write(*,*) nsteps, k, nocc, &
+       mu, beta, method, osteps, occErrLimit, threshold, tol
+    !stop
     allocate(trace(2))
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
@@ -106,7 +264,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
           else
             call prg_setup_linsys(p_bml, y_bml, p2_bml, d_bml, w_bml, aux1_bml, aux2_bml, k, threshold)
           end if
-          call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, tol, threshold)
+          call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, aux1_bml, w_bml, tol, threshold)
         enddo
       else
         write(*,*) "Doing NS"
@@ -123,9 +281,9 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
             call prg_setup_linsys(p_bml, y_bml, p2_bml, d_bml, w_bml, aux1_bml, aux2_bml, k, threshold)
           end if
           if (i .eq. 1) then
-            call prg_conjgrad(y_bml, ai_bml, I_bml, d_bml, w_bml, 0.9_dp, threshold)
+            call prg_conjgrad(y_bml, ai_bml, I_bml, aux1_bml, d_bml, w_bml, 0.9_dp, threshold)
           end if
-          call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, tol, threshold)
+          call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux1_bml, I_bml, tol, threshold)
           call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
         enddo
 
@@ -185,7 +343,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     real(dp), intent(in) :: mu, threshold
     real(dp), intent(inout), optional :: tol
 
-    type(bml_matrix_t) :: w_bml, y_bml, d_bml, p2_bml, aux1_bml, aux2_bml, I_bml, ai_bml
+    type(bml_matrix_t) :: w_bml, y_bml, c_bml, d_bml, p2_bml, aux1_bml, aux2_bml, I_bml, ai_bml
     real(dp) :: cnst
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
@@ -201,6 +359,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, c_bml)
     if (method .eq. 1) then
       call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
       call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
@@ -221,7 +380,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
         call bml_copy(p2_bml, y_bml)
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
-        call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, tol, threshold)
+        call prg_conjgrad(y_bml, p_bml, p2_bml, d_bml, w_bml, c_bml, tol, threshold)
       enddo
     else
       write(*,*) "Doing NS"
@@ -232,9 +391,9 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         if (i .eq. 1) then
-          call prg_conjgrad(y_bml, ai_bml, I_bml, d_bml, w_bml, 0.9_dp, threshold)
+          call prg_conjgrad(y_bml, ai_bml, I_bml, c_bml, d_bml, w_bml, 0.9_dp, threshold)
         end if
-        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, tol, threshold)
+        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, c_bml, I_bml, tol, threshold)
         call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
       enddo
     endif
@@ -246,6 +405,7 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
     call bml_deallocate(w_bml)
     call bml_deallocate(d_bml)
     call bml_deallocate(y_bml)
+    call bml_deallocate(c_bml)
     if (method .eq. 1) then
       call bml_deallocate(ai_bml)
       call bml_deallocate(I_bml)
@@ -253,6 +413,89 @@ subroutine prg_implicit_fermi_zero(h_bml, p_bml, nsteps, mu, method, threshold,
 
   end subroutine prg_implicit_fermi_zero
 
+
+  !> Calculate first order density matrix response to perturbations using Implicit Fermi Dirac.
+  !! \param H0_bml Input Hamiltonian matrix.
+  !! \param H1_bml Input First order perturbation of H0.
+  !! \param P0_bml Output density matrix.
+  !! \param P1_bml Output First order density matrix response.
+  !! \param nsteps Number of recursion steps.
+  !! \param mu0 Shifted chemical potential.
+  !! \param beta Input inverse temperature.
+  !! \param nocc Number of occupied states.
+  !! \param threshold Threshold for matrix algebra.
+  !! See \cite{niklasson2015}
+  subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bml, &
+       Inv_bml, nsteps, mu0, beta, nocc, threshold)
+
+    implicit none
+
+    type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, Inv_bml(nsteps)
+    type(bml_matrix_t), intent(inout) :: P0_bml, P1_bml
+    real(dp), intent(in) :: mu0, threshold
+    real(dp)  :: mu1
+    real(dp), intent(in) :: beta, nocc
+    integer, intent(in) :: nsteps
+    type(bml_matrix_t) :: B_bml, C_bml, C0_bml
+    character(20) :: bml_type
+    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst 
+    integer :: N, M, i, j, k
+
+    bml_type = bml_get_type(H0_bml)
+    N = bml_get_N(H0_bml)
+    M = bml_get_M(H0_bml)
+
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
+
+    cnst = beta/(2**(2+nsteps))
+
+      ! P0 = 0.5*II - cnst*(H0-mu0*II)
+      call bml_copy(H0_bml, P0_bml)
+      call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
+
+      ! P1 =  - cnst*H1
+      call bml_copy(H1_bml, P1_bml)
+      call bml_scale(-1.0_dp*cnst, P1_bml)
+      do i = 1, nsteps
+
+        ! Calculate coefficient matrices
+        ! C0 = P0^2
+        call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+        ! C = P0*P1+P1*P0, B = 2(P1 - C)
+        call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+        call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+        call bml_copy(P1_bml, B_bml)
+        call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+        ! Get next P0
+        call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
+        ! Get next P1
+        ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
+        call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+        call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+      enddo
+
+      ! dPdmu = beta*P0(I-P0)
+      call bml_copy(P0_bml, B_bml)
+      call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
+      call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+      call bml_scale(beta,C_bml)
+      dPdmu_trace = bml_trace(C_bml)
+      p1_trace = bml_trace(P1_bml)
+      mu1 =  - p1_trace/dPdmu_trace
+      if (abs(dPdmu_trace) > 1e-8) then
+        call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
+      endif 
+
+      call bml_deallocate(B_bml)
+      call bml_deallocate(C_bml)
+      call bml_deallocate(C0_bml)
+
+
+  end subroutine prg_implicit_fermi_first_order_response
+
+
   !> Calculate density matrix response to perturbations using Implicit Fermi Dirac.
   !! \param H0_bml Input Hamiltonian matrix.
   !! \param H1_bml, H2_bml, H3_bml Input First to third order perturbations of H0.
@@ -270,7 +513,7 @@ end subroutine prg_implicit_fermi_zero
   !! See \cite{niklasson2015}
   subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P1_bml, P2_bml, P3_bml, &
        nsteps, mu0, mu, beta, nocc, occ_tol, lin_tol, order, threshold)
-
+  
     implicit none
 
     type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, H2_bml, H3_bml
@@ -279,7 +522,7 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
     real(dp), allocatable, intent(inout) :: mu(:)
     real(dp), intent(in) :: beta, occ_tol, lin_tol, nocc
     integer, intent(in) :: nsteps
-    type(bml_matrix_t) :: I_bml, tmp1_bml, tmp2_bml, C0_bml, T_bml, Ti_bml
+    type(bml_matrix_t) :: I_bml, tmp1_bml, tmp2_bml, tmp3_bml, C0_bml, T_bml, Ti_bml
     type(bml_matrix_t), allocatable :: B_bml(:), P_bml(:), C_bml(:), H_bml(:)
     real(dp), allocatable :: p_trace(:), trace(:)
     character(20) :: bml_type
@@ -305,6 +548,7 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
     end do
 
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp1_bml)
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp3_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, tmp2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, T_bml)
@@ -371,10 +615,10 @@ subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P
         call bml_scale_add_identity(T_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find T-inverse
         if (i .eq. 1) then
-          call prg_conjgrad(T_bml, Ti_bml, I_bml, tmp1_bml, tmp2_bml, 0.01_dp, threshold)
+          call prg_conjgrad(T_bml, Ti_bml, I_bml, tmp1_bml, tmp2_bml, tmp3_bml,0.01_dp, threshold)
           call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
         end if
-        call prg_newtonschulz(T_bml, Ti_bml, tmp1_bml, tmp2_bml, lin_tol, threshold)
+        call prg_newtonschulz(T_bml, Ti_bml, tmp1_bml, tmp2_bml, tmp3_bml, I_bml, lin_tol, threshold)
         ! Get next P0
         call bml_multiply(Ti_bml, C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
         ! Get next P1
@@ -608,31 +852,39 @@ end subroutine prg_setup_linsys
   !! \param tol Convergence criterion (Frobenius norm of residual matrix)
   !! \param threshold Threshold for matrix algebra
 
-  subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, tol, threshold)
+  subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, threshold)
 
     implicit none
 
-    type(bml_matrix_t), intent(inout) :: ai_bml, r_bml, tmp_bml
-    type(bml_matrix_t), intent(in) :: a_bml
+    type(bml_matrix_t), intent(inout) :: ai_bml, r_bml, tmp_bml, d_bml
+    type(bml_matrix_t), intent(in) :: a_bml, I_bml
     real(dp), intent(in) :: threshold, tol
-    real(dp) :: norm
-    integer :: i
+    real(dp) :: err,prev_err,scaled_tol
+    integer :: i,N,N2
 
-    norm = 1.0
+    N = bml_get_N(a_bml)
+    err = 100000.0
     i = 0
-    do while(norm > tol)
+    N2 = N*N
+    scaled_tol = tol*N
+    do while(err > scaled_tol)
+      !write(*,*) 'iter = ', i 
       call bml_copy(ai_bml, tmp_bml)
       call bml_multiply(a_bml, ai_bml, r_bml, 1.0_dp, 0.0_dp, threshold)
       call bml_scale_add_identity(r_bml, -1.0_dp, 1.0_dp, threshold)
-      norm = bml_fnorm(r_bml)
-      !   write(*,*) "norm = ", norm
-      if (norm < tol) then
-        exit
-      end if
-      call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
+      prev_err = err
+      err = bml_fnorm(r_bml)
+      !write(*,*) "err = ", err
+      !write(*,*) "prev_err = ", prev_err
+      if (10*prev_err < err) then 
+        write(*,*) 'NS did not converge, calling conjugate gradient'
+        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.0001_dp, threshold)
+      else 
+        call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
+      endif 
       i = i + 1
     enddo
-    ! write(*,*) "Number of NS iterations:", i
+    !write(*,*) "Number of NS iterations:", i
   end subroutine prg_newtonschulz
 
   ! Preconditioned CG, preconditioner inverse diagonal of A
@@ -717,42 +969,42 @@ end subroutine prg_pcg
   !! \param w_bml Auxillary matrix
   !! \param cg_tol Convergence condition (OBS squared Frobenius norm of residual matrix)
   !! \param threshold Threshold for matrix algebra
-  subroutine prg_conjgrad(A_bml, p_bml, p2_bml, d_bml, w_bml, cg_tol, threshold)
+  subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, threshold)
 
     implicit none
 
-    type(bml_matrix_t), intent(in) :: A_bml
-    type(bml_matrix_t), intent(inout) :: p_bml, p2_bml, d_bml, w_bml
+    type(bml_matrix_t), intent(in) :: A_bml, p2_bml
+    type(bml_matrix_t), intent(inout) :: p_bml, tmp_bml, d_bml, w_bml
     real(dp), intent(in) :: cg_tol, threshold
 
     real(dp) :: alpha, beta
     integer :: k
     real(dp) :: r_norm_old, r_norm_new
 
-    call bml_multiply(A_bml, p_bml, p2_bml, -1.0_dp, 1.0_dp, threshold)
-    r_norm_new = bml_sum_squares(p2_bml)
+    call bml_copy(p2_bml,tmp_bml)
+    call bml_multiply(A_bml, p_bml, tmp_bml, -1.0_dp, 1.0_dp, threshold)
+    r_norm_new = bml_sum_squares(tmp_bml)
     k = 0
 
     do while (r_norm_new .gt. cg_tol)
 
-      !   write(*,*) r_norm_new
+      write(*,*) r_norm_new
       k = k + 1
       if (k .eq. 1) then
-        write(*,*) r_norm_new
-        call bml_copy(p2_bml, d_bml)
+        call bml_copy(tmp_bml, d_bml)
       else
         beta = r_norm_new/r_norm_old
-        call bml_add(d_bml, p2_bml, beta, 1.0_dp, threshold)
+        call bml_add(d_bml, tmp_bml, beta, 1.0_dp, threshold)
       endif
 
       call bml_multiply(A_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
       alpha = r_norm_new/bml_trace_mult(d_bml, w_bml)
 
       call bml_add(p_bml, d_bml, 1.0_dp, alpha, threshold)
-      call bml_add(p2_bml, w_bml, 1.0_dp, -alpha, threshold)
+      call bml_add(tmp_bml, w_bml, 1.0_dp, -alpha, threshold)
       r_norm_old = r_norm_new
-      r_norm_new = bml_sum_squares(p2_bml)
-      if (k .gt. 50) then
+      r_norm_new = bml_sum_squares(tmp_bml)
+      if (k .gt. 500) then
         write(*,*) "Conjugate gradient is not converging"
         stop
       endif
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
new file mode 100644
index 00000000..54f4ecf1
--- /dev/null
+++ b/src/prg_xlbokernel_mod.F90
@@ -0,0 +1,837 @@
+!> Pre-conditioned O(N) calculation of the kernel for XL-BOMD.
+!! \ingroup PROGRESS
+!! \brief Here are subroutines implementing Niklasson's scheme for 
+!!  low-rank, Krylov subspace approximation of the kernel. 
+module prg_xlbokernel_mod
+
+  use omp_lib
+  use bml
+  use prg_normalize_mod
+  use prg_densitymatrix_mod
+  use prg_timer_mod
+  use prg_parallel_mod
+  use prg_ewald_mod
+  use prg_implicit_fermi_mod 
+
+  implicit none
+
+  private  !Everything is private by default
+
+  integer, parameter :: dp = kind(1.0d0)
+
+  public :: prg_kernel_multirank
+  public :: prg_kernel_multirank_latte
+  public :: prg_kernel_matrix_multirank
+  public :: prg_full_kernel
+  public :: prg_full_kernel_latte
+
+contains
+
+subroutine Invert(A,AI,N)
+
+implicit none
+integer, parameter                          :: PREC = 8
+integer, intent(in)                         :: N
+real(PREC),                 intent(in)      :: A(N,N)
+real(PREC), intent(out)                     :: AI(N,N)
+real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
+integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
+integer                                     :: I,J,K
+
+external DGETRF
+external DGETRI
+
+AI = A
+LDA = N
+M = N
+LWORK = N+N*N
+
+call DGETRF(M, N, AI, LDA, IPIV, INFO)
+call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
+
+end subroutine Invert
+  
+!> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
+!! \param KRes The low rank approximation 
+!! \param KK0_bml The pre-conditioner K0.
+!! \param Res The residual q[n]-n
+!! \param FelTol Relative error tolerance for approximation 
+!! \param L Number of vectors used. 
+!! \param LMAX Maximum nr of vectors to use. 
+!! \param NUMRANK Nr of vectors to use. 
+!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
+!! \param mu The chemical potiential. 
+!! \param beta Scaled inverse temperature.
+!! \param RXYZ Nuclear coordinates.
+!! \param Box Box dimensions. 
+!! \param Hubbard_U Hubbard U list. 
+!! \param Element_Pointer List to keep track of elements. 
+!! \param Nr_atoms The number of atoms. 
+!! \param HDIM Hamiltonian matrix dimension. 
+!! \param Max_Nr_Neigh Max neighbours for Ewald. 
+!! \param Coulomb_acc Coulomb accuracy.
+!! \param nebcoul Neighbour lists.
+!! \param totnebcoul Number of neighbours list. 
+!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
+!! \param S_bml The S matrix. 
+!! \param Z_bml, The Z matrix. 
+!! \param Nocc Occupation. 
+!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+!! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+!! \param m_rec Number of recursion steps.
+!! \param threshold Threshold value for matrix truncation.
+!! \param Nr_elem Number of elements in Hubbard list. 
+subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, & 
+    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
+    S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+
+!! Res = q[n] - n
+!! KK0 is preconditioner
+!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+  implicit none
+  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+  real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
+  real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
+  real(dp), intent(in)      :: Res(Nr_atoms)
+  integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
+  real(dp), intent(in)      :: Hubbard_U(Nr_elem)
+  type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+  real(dp)                  :: K0Res(Nr_atoms)
+  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+  real(dp), intent(in)      :: threshold
+  integer,    intent(in)      :: LMAX
+  character(20)             :: bml_type
+  integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+  real(dp)                  :: dq_v(Nr_atoms)
+  real(dp), intent(inout)   :: KRes(Nr_atoms)
+  integer                     :: I,J,K,It,N,MN
+  integer, intent(out)        :: L
+  real(dp)                  :: Fel, proj_tmp, start, finish
+  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
+
+  call timer_prg_init()
+  bml_type = bml_get_type(HO_bml)
+  N = bml_get_N(HO_bml)
+  MN = bml_get_M(HO_bml)
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+
+  call bml_transpose(Z_bml,ZT_bml)
+
+  ! K0Res = KK0*Res temporary for matrix-vector multiplication
+  call bml_set_row(Res_bml,1,Res,threshold)
+  call bml_transpose(KK0_bml,KK0T_bml)
+  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+  call bml_get_row(K0Res_bml,1,row_NA)
+  K0Res = row_NA
+  dr = K0Res
+
+  I = 0
+  Fel = 1.D0  
+  do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
+    I = I + 1
+    vi(:,I) = dr/norm2(dr)
+    do J = 1,I-1
+      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+    enddo
+    vi(:,I) = vi(:,I)/norm2(vi(:,I))
+    v(:) = vi(:,I)  ! v_i
+
+    ! Compute H1 = H(v) 
+    dq_v = v 
+    call prg_timer_start(1)
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
+    do J = 1,Nr_atoms
+      call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
+      dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
+      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+    call prg_timer_stop(1,1)
+
+
+    call bml_deallocate(H1_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+    do It = 1,Nr_atoms-1
+       do K = Hinxlist(It)+1,Hinxlist(It+1)
+           row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+    enddo
+     
+    ! H1 = 1/2(S*H1+H1*S)
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(X_bml,H1_bml)
+
+    ! H1 = Z^T H1 Z
+    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+        m_rec, mu, beta, real(nocc,dp), threshold)
+    
+    ! D1 = Z D1 Z^T
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    ! Compute dq/dv
+    call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
+    call bml_get_diagonal(X_bml,row1)
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+    do It = 1, Nr_atoms-1
+       dq_dv(It) = 0
+       do K = Hinxlist(It)+1,Hinxlist(It+1)
+          dq_dv(It) = dq_dv(It) + row1(K)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    dq_dv(Nr_atoms) = 0
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+    enddo
+
+    dr = dq_dv - v
+    ! fi = K0(dq_dv - v)
+    call bml_set_row(Res_bml,1,dr,threshold)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    dr = row_NA
+    fi(:,I) = dr
+
+    L = I
+    allocate(O(L,L), M(L,L))
+    do K = 1,L
+    do J = 1,L
+       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+    enddo
+    enddo
+    call Invert(O,M,L)                        ! M = O^(-1)
+    IdentRes = 0.D0*K0Res
+    KRes = 0.D0
+    do K = 1,L
+    do J = 1,L
+      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+      IdentRes = IdentRes + proj_tmp*fi(:,K)
+      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
+    enddo
+    enddo
+    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+    deallocate(O, M)
+
+  enddo
+
+  deallocate(row1);deallocate(row2);deallocate(row_NA)
+  call bml_deallocate(KK0T_bml)
+  call bml_deallocate(K0Res_bml)
+  call bml_deallocate(Res_bml)
+  call bml_deallocate(ZT_bml)
+  call prg_timer_shutdown() 
+
+  end subroutine prg_kernel_multirank_latte
+
+  ! Above routine but for development code   
+  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
+    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+!! Res = q[n] - n
+!! KK0 is preconditioner
+!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+  implicit none
+  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+  real(dp), intent(in)      :: Res(Nr_atoms)
+  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+  real(dp)                  :: K0Res(Nr_atoms)
+  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+  real(dp), intent(in)      :: threshold
+  integer,    intent(in)      :: LMAX
+  character(10), intent(in)   :: Element_Type(Nr_atoms)
+  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+  real(dp)                  :: dq_v(Nr_atoms)
+  real(dp), intent(inout)   :: KRes(Nr_atoms)
+  integer                     :: I,J,K,It
+  integer, intent(out)        :: L
+  real(dp)                  :: Fel, proj_tmp
+  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
+
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+
+  ! K0Res = KK0*Res temporary for matrix-vector multiplication
+  call bml_transpose(KK0_bml,KK0T_bml) 
+  call bml_set_row(Res_bml,1,Res,ONE*1e-14)
+  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
+  call bml_get_row(K0Res_bml,1,row_NA)
+  K0Res = row_NA
+  dr = K0Res
+
+  I = 0
+  Fel = 1.D0  
+  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+    I = I + 1
+    vi(:,I) = dr/norm2(dr)
+    do J = 1,I-1
+      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+    enddo
+    vi(:,I) = vi(:,I)/norm2(vi(:,I))
+    v(:) = vi(:,I)  ! v_i
+
+    dq_v = v 
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+    do J = 1,Nr_atoms
+      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    row1 = 0.0_dp
+    do J = 1,HDIM
+       call bml_set_row(H1_bml,J,row1,threshold)
+    enddo
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+    do J = 1,Nr_atoms
+       do K = H_INDEX_START(J),H_INDEX_END(J)
+           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(D1_bml,H1_bml)
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+        m_rec, mu, beta, real(nocc,dp), threshold)
+    
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+    do It = 1, Nr_atoms
+      dq_dv(It) = 0
+      do K = H_INDEX_START(It), H_INDEX_END(It)
+        call bml_get_row(S_bml,K,row1)
+        call bml_get_row(D1_bml,K,row2)
+        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    dr = dq_dv - v 
+    ! fi = K0(dq_dv - v)
+    call bml_set_row(Res_bml,1,dr,threshold)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    dr = row_NA
+    fi(:,I) = dr
+
+    L = I
+    allocate(O(L,L), M(L,L))
+    do K = 1,L
+    do J = 1,L
+       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+    enddo
+    enddo
+    call Invert(O,M,L)                        ! M = O^(-1)
+    IdentRes = 0.D0*K0Res
+    KRes = 0.D0
+    do K = 1,L
+    do J = 1,L
+      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+      IdentRes = IdentRes + proj_tmp*fi(:,K)
+      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
+    enddo
+    enddo
+    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+    deallocate(O, M)
+
+  enddo
+
+  deallocate(row1);deallocate(row2);deallocate(row_NA)
+  call bml_deallocate(KK0T_bml)
+  call bml_deallocate(K0Res_bml)
+  call bml_deallocate(Res_bml)
+
+  end subroutine prg_kernel_multirank
+
+!> Compute full inverse Jacobian of q[n]-n (for LATTE)
+!! \param KK The inverse Jacobian.
+!! \param DO_bml Orthogonalized density matrix. 
+!! \param mu0 The chemical potiential. 
+!! \param RXYZ Nuclear coordinates.
+!! \param Box Box dimensions. 
+!! \param Hubbard_U Hubbard U list. 
+!! \param Element_Pointer List to keep track of elements. 
+!! \param Nr_atoms The number of atoms. 
+!! \param HDIM Hamiltonian matrix dimension. 
+!! \param Max_Nr_Neigh Max neighbours for Ewald. 
+!! \param Coulomb_acc Coulomb accuracy
+!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
+!! \param S_bml The S matrix. 
+!! \param Z_bml, The Z matrix. 
+!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
+!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+!! \param Nocc Occupation. 
+!! \param m_rec Number of recursion steps.
+!! \param threshold Threshold value for matrix truncation.
+!! \param beta Scaled inverse temperature.
+!! \param Nr_elem Number of elements in Hubbard list. 
+subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
+Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, & 
+Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+Nocc,m_rec,threshold,beta,Nr_elem)
+
+use bml
+
+implicit none
+integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
+real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
+real(PREC)                     :: v(Nr_atoms)
+real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
+integer,    intent(in)         :: Hinxlist(HDIM)
+real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
+type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
+real(PREC), intent(inout)      :: mu0
+integer,    intent(in)         :: Element_Pointer(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+real(PREC)                     :: dq_v(Nr_atoms)
+real(PREC)                     :: dq_dv(Nr_atoms), err,tol
+real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+integer                        :: I,J,K, ITER, mm,It,N,MN
+real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)     
+type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
+character(20)                  :: bml_type
+
+  bml_type = bml_get_type(HO_bml)
+  N = bml_get_N(HO_bml)
+  MN = bml_get_M(HO_bml)
+  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+  call bml_transpose(Z_bml,ZT_bml)
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+  Coulomb_Pot_dq_v = ZERO
+  Coulomb_Pot_k = ZERO
+  dq_v = ZERO
+  JJ = ZERO
+  KK = ZERO
+  
+  do J = 1,Nr_atoms  
+    dq_v(J) = ONE
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+    do I = 1,Nr_atoms   
+        call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
+        dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
+    Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    call bml_deallocate(H1_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+    do I = 1,Nr_atoms-1
+       do K = Hinxlist(I)+1,Hinxlist(I+1)
+           row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+    enddo
+
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(X_bml,H1_bml)
+
+    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold) 
+
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
+    call bml_get_diagonal(X_bml,row1)
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+    do It = 1, Nr_atoms-1
+       dq_dv(It) = 0
+       do K = Hinxlist(It)+1,Hinxlist(It+1)
+          dq_dv(It) = dq_dv(It) + row1(K)
+       enddo
+       JJ(It,J) = dq_dv(It)
+    enddo
+    !$OMP END PARALLEL DO
+    dq_dv(Nr_atoms) = 0
+    do K = Hinxlist(Nr_atoms)+1,HDIM
+       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+    enddo
+    JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+    dq_v = ZERO
+  enddo
+  
+  do I = 1,Nr_atoms
+     JJ(I,I) = JJ(I,I) - ONE
+  enddo
+  
+  call Invert(JJ,KK,Nr_atoms)
+  
+  deallocate(row1); deallocate(row2); deallocate(JJ)
+  call bml_deallocate(ZT_bml)
+
+end subroutine prg_full_kernel_latte
+
+!> Compute full inverse Jacobian of q[n]-n (for development code) 
+!! \param KK The inverse Jacobian.
+!! \param DO_bml Orthogonalized density matrix. 
+!! \param mu0 The chemical potiential. 
+!! \param RX,RY,RZ Nuclear coordinates.
+!! \param Lbox Box dimensions. 
+!! \param Hubbard_U Hubbard U list. 
+!! \param Element_Type List to keep track of elements. 
+!! \param Nr_atoms The number of atoms. 
+!! \param HDIM Hamiltonian matrix dimension. 
+!! \param Max_Nr_Neigh Max neighbours for Ewald. 
+!! \param Coulomb_acc Coulomb accuracy
+!! \param TIMERATIO Parameter for Ewald
+!! \param nnRx,nnRy,nnRz Neighbour lists.
+!! \param nrnnlist Number of neighbours list.
+!! \param nnType Refers to original order of atoms. 
+!! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian. 
+!! \param S_bml The S matrix. 
+!! \param Z_bml, The Z matrix. 
+!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
+!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+!! \param Nocc Occupation. 
+!! \param Znuc List of nuclear charges.
+!! \param m_rec Number of recursion steps.
+!! \param threshold Threshold value for matrix truncation.
+!! \param beta Scaled inverse temperature.
+!! \param diagonal Auxillary vector. 
+
+subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, & 
+nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+Nocc,Znuc,m_rec,threshold,beta,diagonal)
+
+use bml
+
+implicit none
+integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
+real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
+real(PREC)                     :: v(Nr_atoms)
+real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
+real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
+character(10), intent(in)      :: Element_Type(Nr_atoms)
+integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
+real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+real(PREC)                     :: dq_v(Nr_atoms)
+real(PREC)                     :: dq_dv(Nr_atoms) 
+real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+integer                        :: I,J,K, ITER, mm,It
+real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)        
+
+  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+  Coulomb_Pot_dq_v = ZERO
+  Coulomb_Pot_k = ZERO
+  dq_v = ZERO
+  JJ = ZERO
+  KK = ZERO
+  
+  do J = 1,Nr_atoms  
+    dq_v(J) = ONE
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+    do I = 1,Nr_atoms   
+        call Ewald_Real_Space_Single(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,I,RX,RY,RZ,LBox, &
+        dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+    TIMERATIO,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    diagonal = 0.0_dp
+    do I = 1,HDIM
+       call bml_set_row(H1_bml,I,diagonal,threshold)
+    enddo
+    
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+    do I = 1,Nr_atoms
+       do K = H_INDEX_START(I),H_INDEX_END(I)
+           diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call bml_set_diagonal(H1_bml,diagonal,threshold)
+    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(D1_bml,H1_bml)
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold) 
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    do It = 1, Nr_atoms
+       dq_dv(It) = 0
+       do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+       enddo
+       JJ(It,J) = dq_dv(It)
+    enddo
+    dq_v = ZERO
+  enddo
+
+  do I = 1,Nr_atoms
+     JJ(I,I) = JJ(I,I) - ONE
+  enddo
+  call Invert(JJ,KK,Nr_atoms)
+  deallocate(row1); deallocate(row2); deallocate(JJ)
+
+end subroutine prg_full_kernel
+
+
+! Compute the low-rank kernel matrix. (For development code) 
+  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
+    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+!! Res = q[n] - n
+!! KK0 is preconditioner
+!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+  implicit none
+  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+  real(dp), intent(in)      :: Res(Nr_atoms)
+  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
+  real(dp)                  :: K0Res(Nr_atoms)
+  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+  type(bml_matrix_t),intent(inout) :: KK0_bml
+  real(dp), intent(in)      :: threshold
+  integer,    intent(in)      :: LMAX
+  character(10), intent(in)   :: Element_Type(Nr_atoms)
+  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+  real(dp)                  :: dq_v(Nr_atoms)
+  real(dp), intent(out)     :: KRes(Nr_atoms)
+  integer                     :: I,J,K,It,col,row
+  integer, intent(out)        :: L
+  real(dp)                  :: Fel, proj_tmp,elem
+  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
+
+  allocate(row1(HDIM));allocate(row2(HDIM));
+
+  dr = Res
+  I = 0
+  Fel = 1.D0  
+  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+    I = I + 1
+    vi(:,I) = dr/norm2(dr)
+    do J = 1,I-1
+      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+    enddo
+    vi(:,I) = vi(:,I)/norm2(vi(:,I))
+    v(:) = vi(:,I)  ! v_i
+
+    dq_v = v 
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+    do J = 1,Nr_atoms
+      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+    enddo
+    !$OMP END PARALLEL DO
+    
+    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+    row1 = 0.0_dp
+    do J = 1,HDIM
+       call bml_set_row(H1_bml,J,row1,threshold)
+    enddo
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+    do J = 1,Nr_atoms
+       do K = H_INDEX_START(J),H_INDEX_END(J)
+           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+       enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    call bml_set_diagonal(H1_bml,row1,threshold)
+    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
+    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+    call bml_copy(D1_bml,H1_bml)
+
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+        m_rec, mu, beta, real(nocc,dp), threshold)
+    
+    call bml_transpose(Z_bml,X_bml)
+    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+    do It = 1, Nr_atoms
+      dq_dv(It) = 0
+      do K = H_INDEX_START(It), H_INDEX_END(It)
+        call bml_get_row(S_bml,K,row1)
+        call bml_get_row(D1_bml,K,row2)
+        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    dr = dq_dv - v 
+    fi(:,I) = dr
+
+    L = I
+    allocate(O(L,L), M(L,L))
+    do K = 1,L
+    do J = 1,L
+       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+    enddo
+    enddo
+    call Invert(O,M,L)                        ! M = O^(-1)
+    IdentRes = 0.D0*Res
+    KRes = 0.D0
+    do K = 1,L
+    do J = 1,L
+      proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
+      IdentRes = IdentRes + proj_tmp*fi(:,K)
+      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n) 
+    enddo
+    enddo
+    Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+  
+    ! Does not work, need to normalize matrix  
+        if ((Fel > FelTol).AND.(I < (LMAX))) then 
+          deallocate(O, M)
+        else 
+         do row = 1,Nr_atoms
+         do col = 1,Nr_atoms 
+           elem = 0.0
+           do K = 1,L
+             do J = 1,L 
+               elem = elem + M(J,K)*vi(row,J)*fi(col,K)
+             enddo
+           enddo 
+           if (abs(elem) > threshold) then
+             call bml_set_element(KK0_bml,row,col,elem)
+           endif
+         enddo
+         enddo  
+         deallocate(O,M)
+         endif
+    enddo 
+    deallocate(row1); deallocate(row2);
+
+  end subroutine prg_kernel_matrix_multirank
+
+end module prg_xlbokernel_mod

From bf9a36831227d21621931f9c7ae86ef625d412e5 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Wed, 12 May 2021 19:34:44 +0200
Subject: [PATCH 06/10] O(N) pre-conditioned kernel matrix with implicit
 FD-expansion response

---
 src/prg_ewald_mod.F90          | 1386 ++++++++++++++---------------
 src/prg_implicit_fermi_mod.F90 |  210 ++---
 src/prg_xlbokernel_mod.F90     | 1505 ++++++++++++++++----------------
 tests/CMakeLists.txt           |    1 +
 tests/src/main.F90             |   47 +-
 5 files changed, 1595 insertions(+), 1554 deletions(-)

diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
index 5a032412..95127797 100644
--- a/src/prg_ewald_mod.F90
+++ b/src/prg_ewald_mod.F90
@@ -1,4 +1,4 @@
-! Ewald sum routines for kernel calculation 
+! Ewald sum routines for kernel calculation
 module prg_ewald_mod
 
   use bml
@@ -20,550 +20,550 @@ module prg_ewald_mod
 
 contains
 
-!> Find Coulomb potential on site I from single charge at site J          
-subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
-    DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
-real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
-real(PREC), intent(in)    :: U(Nr_elem)
-real(PREC)                :: COULCUT, COULCUT2
-real(PREC), intent(out)   :: COULOMBV 
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-
-integer                   :: K, ccnt,l,m,n
-
-COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-COULOMBV = ZERO
-
-TI = TFACT*U(Element_Pointer(I))
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RXYZ(1,I)
-Ra(2) = RXYZ(2,I)
-Ra(3) = RXYZ(3,I)
+  !> Find Coulomb potential on site I from single charge at site J
+  subroutine  Ewald_Real_Space_Single_latte(COULOMBV,I,RXYZ,Box,Nr_elem, &
+       DELTAQ,J,U,Element_Pointer,Nr_atoms,COULACC,HDIM,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Nr_elem, HDIM, Max_Nr_Neigh, I, J, Element_Pointer(Nr_atoms)
+    real(PREC), intent(in)    :: COULACC, DELTAQ(Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+    integer                   :: K, ccnt,l,m,n
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
 
     do k = -1,1
-    do m = -1,1
-    do l = -1,1
-
-    Rb(1) = RXYZ(1,J)+k*box(1,1)
-    Rb(2) = RXYZ(2,J)+m*box(2,2)
-    Rb(3) = RXYZ(3,J)+l*box(3,3)
-    Rab = Rb-Ra  ! OBS b - a !!!
-    dR = norm2(Rab)
-    MAGR = dR
-    MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(Element_Pointer(J))
-       DC = Rab/dR
-
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Pointer(I).eq.Element_Pointer(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-       endif
-    endif
-    enddo
-    enddo
+      do m = -1,1
+        do l = -1,1
+
+          Rb(1) = RXYZ(1,J)+k*box(1,1)
+          Rb(2) = RXYZ(2,J)+m*box(2,2)
+          Rb(3) = RXYZ(3,J)+l*box(3,3)
+          Rab = Rb-Ra  ! OBS b - a !!!
+          dR = norm2(Rab)
+          MAGR = dR
+          MAGR2 = dR*dR
+
+          if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+            TJ = TFACT*U(Element_Pointer(J))
+            DC = Rab/dR
+
+            Z = abs(CALPHA*MAGR)
+            NUMREP_ERFC = erfc(Z)
+
+            CA = NUMREP_ERFC/MAGR
+            COULOMBV = COULOMBV + DELTAQ(J)*CA
+            ccnt = ccnt + 1
+            CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+            EXPTI = exp(-TI*MAGR )
+
+            if (Element_Pointer(I).eq.Element_Pointer(J)) then
+              COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+              ccnt = ccnt + 1
+            else
+              TJ2 = TJ*TJ
+              TJ3 = TJ2*TJ
+              TJ4 = TJ2*TJ2
+              TJ6 = TJ4*TJ2
+              EXPTJ = exp( -TJ*MAGR )
+              TI2MTJ2 = TI2 - TJ2
+              TJ2MTI2 = -TI2MTJ2
+              SA = TI
+              SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+              SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+              SD = TJ
+              SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+              SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+              COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+            endif
+          endif
+        enddo
+      enddo
     enddo
 
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space_Single_latte
-
-subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
-    DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
-
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
-real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-real(PREC), intent(in)    :: U(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-character(10), intent(in) :: Element_Type(Nr_atoms)
-real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-
-integer                   :: K, ccnt,l,m,n
-
-COULVOL = LBox(1)*LBox(2)*LBox(3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-FCOUL = ZERO
-COULOMBV = ZERO
-
-TI = TFACT*U(I)
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RX(I)
-Ra(2) = RY(I)
-Ra(3) = RZ(I)
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Single_latte
+
+  subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+       DELTAQ,J,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,HDIM,Max_Nr_Neigh)
+
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, J
+    real(PREC), intent(in)    :: COULACC, TIMERATIO,DELTAQ(Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+
+    integer                   :: K, ccnt,l,m,n
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
 
     do k = -1,1
-    do m = -1,1
-    do l = -1,1
-
-    Rb(1) = RX(J)+k*Lbox(1)
-    Rb(2) = RY(J)+m*Lbox(2)
-    Rb(3) = RZ(J)+l*Lbox(3)
-    Rab = Rb-Ra  ! OBS b - a !!!
-    dR = norm2(Rab)
-    MAGR = dR
-    MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(J)
-       DC = Rab/dR
-
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Type(I).eq.Element_Type(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
-             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
-              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
-       endif
-
-       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
-       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
-       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
-    endif
-    enddo
+      do m = -1,1
+        do l = -1,1
+
+          Rb(1) = RX(J)+k*Lbox(1)
+          Rb(2) = RY(J)+m*Lbox(2)
+          Rb(3) = RZ(J)+l*Lbox(3)
+          Rab = Rb-Ra  ! OBS b - a !!!
+          dR = norm2(Rab)
+          MAGR = dR
+          MAGR2 = dR*dR
+
+          if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+            TJ = TFACT*U(J)
+            DC = Rab/dR
+
+            Z = abs(CALPHA*MAGR)
+            NUMREP_ERFC = erfc(Z)
+
+            CA = NUMREP_ERFC/MAGR
+            COULOMBV = COULOMBV + DELTAQ(J)*CA
+            ccnt = ccnt + 1
+            CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+            FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+            EXPTI = exp(-TI*MAGR )
+
+            if (Element_Type(I).eq.Element_Type(J)) then
+              COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+              ccnt = ccnt + 1
+              FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+                   + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+            else
+              TJ2 = TJ*TJ
+              TJ3 = TJ2*TJ
+              TJ4 = TJ2*TJ2
+              TJ6 = TJ4*TJ2
+              EXPTJ = exp( -TJ*MAGR )
+              TI2MTJ2 = TI2 - TJ2
+              TJ2MTI2 = -TI2MTJ2
+              SA = TI
+              SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+              SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+              SD = TJ
+              SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+              SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+              COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+              FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+                   + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+            endif
+
+            FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+            FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+            FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+          endif
+        enddo
+      enddo
     enddo
+
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Single
+
+  subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
+       DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    integer, intent(in)       :: Element_Pointer(Nr_atoms)
+    integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
+
+    do newj = 1,totnebcoul(I)
+      J = NEBCOUL(1, NEWJ, I)
+      PBCI = NEBCOUL(2, NEWJ, I)
+      PBCJ = NEBCOUL(3, NEWJ, I)
+      PBCK = NEBCOUL(4, NEWJ, I)
+      Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+           REAL(PBCK)*BOX(3,1)
+
+      Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+           REAL(PBCK)*BOX(3,2)
+
+      Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+           REAL(PBCK)*BOX(3,3)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(Element_Pointer(J))
+        DC = Rab/dR
+
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Pointer(I).eq.Element_Pointer(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
     enddo
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_latte
+
+  subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
+       DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
+    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, nnI
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+    do nnI = 1,nrnnlist(I)
+      Rb(1) = nnRx(I,nnI)
+      Rb(2) = nnRy(I,nnI)
+      Rb(3) = nnRz(I,nnI)
+      J = nnType(I,nnI)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(J)
+        DC = Rab/dR
+
+        ! Not Using Numerical Recipes ERFC
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        !TEST(ccnt) = DELTAQ(J)*CA
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Type(I).eq.Element_Type(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+          !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
+               + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+          FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
+               + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
+        endif
 
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space_Single
-
-subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
-    DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I, Nr_Elem
-real(PREC), intent(in)    :: COULACC
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
-real(PREC), intent(in)    :: U(Nr_elem)
-real(PREC)                :: COULCUT, COULCUT2
-integer, intent(in)       :: Element_Pointer(Nr_atoms)
-integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
-real(PREC), intent(out)   :: COULOMBV
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-integer                   :: J,K, ccnt, newj, PBCI,PBCJ,PBCK
-
-COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-COULOMBV = ZERO
-
-TI = TFACT*U(Element_Pointer(I))
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RXYZ(1,I)
-Ra(2) = RXYZ(2,I)
-Ra(3) = RXYZ(3,I)
-
-do newj = 1,totnebcoul(I)
-        J = NEBCOUL(1, NEWJ, I)
-        PBCI = NEBCOUL(2, NEWJ, I)
-        PBCJ = NEBCOUL(3, NEWJ, I)
-        PBCK = NEBCOUL(4, NEWJ, I)
-        Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
-             REAL(PBCK)*BOX(3,1) 
-
-        Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
-             REAL(PBCK)*BOX(3,2) 
-
-        Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
-             REAL(PBCK)*BOX(3,3) 
-        Rab = Rb-Ra  ! OBS b - a !!!
-        dR = norm2(Rab)
-        MAGR = dR
-        MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(Element_Pointer(J))
-       DC = Rab/dR
-
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Pointer(I).eq.Element_Pointer(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-       endif
-
-    endif
-enddo
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space_latte
-
-subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
-    DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
-
-implicit none
-
-integer,    parameter     :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, I
-real(PREC), intent(in)    :: COULACC, TIMERATIO
-real(PREC)                :: TFACT, RELPERM, KECONST
-real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
-real(PREC), intent(in)    :: U(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-character(10), intent(in) :: Element_Type(Nr_atoms)
-integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(out)   :: COULOMBV, FCOUL(3)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
-real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
-real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
-integer                   :: J,K, ccnt, nnI
-
-COULVOL = LBox(1)*LBox(2)*LBox(3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-COULCUT = 12.D0
-CALPHA = SQRTX/COULCUT
-COULCUT2 = COULCUT*COULCUT
-CALPHA2 = CALPHA*CALPHA
-
-RELPERM = ONE
-KECONST = 14.3996437701414D0*RELPERM
-TFACT  = 16.0D0/(5.0D0*KECONST)
-
-FCOUL = ZERO
-COULOMBV = ZERO
-
-TI = TFACT*U(I)
-TI2 = TI*TI
-TI3 = TI2*TI
-TI4 = TI2*TI2
-TI6 = TI4*TI2
-
-SSA = TI
-SSB = TI3/48.D0
-SSC = 3.D0*TI2/16.D0
-SSD = 11.D0*TI/16.D0
-SSE = 1.D0
-
-Ra(1) = RX(I)
-Ra(2) = RY(I)
-Ra(3) = RZ(I)
-
-do nnI = 1,nrnnlist(I)
-  Rb(1) = nnRx(I,nnI)
-  Rb(2) = nnRy(I,nnI)
-  Rb(3) = nnRz(I,nnI)
-  J = nnType(I,nnI)
-    Rab = Rb-Ra  ! OBS b - a !!!
-    dR = norm2(Rab)
-    MAGR = dR
-    MAGR2 = dR*dR
-
-    if ((dR <= COULCUT).and.(dR > 1e-12)) then
-
-       TJ = TFACT*U(J)
-       DC = Rab/dR
-
-       ! Not Using Numerical Recipes ERFC
-       Z = abs(CALPHA*MAGR)
-       NUMREP_ERFC = erfc(Z)
-
-       CA = NUMREP_ERFC/MAGR
-       COULOMBV = COULOMBV + DELTAQ(J)*CA
-       ccnt = ccnt + 1
-       !TEST(ccnt) = DELTAQ(J)*CA
-       CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
-       FORCE = -KECONST*DELTAQ(I)*DELTAQ(J)*CA/MAGR
-       EXPTI = exp(-TI*MAGR )
-
-       if (Element_Type(I).eq.Element_Type(J)) then
-           COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           ccnt = ccnt + 1
-           !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
-           FORCE = FORCE + (KECONST*DELTAQ(I)*DELTAQ(J)*EXPTI)*((SSE/MAGR2 - TWO*SSB*MAGR - SSC) &
-             + SSA*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR))
-       else
-           TJ2 = TJ*TJ
-           TJ3 = TJ2*TJ
-           TJ4 = TJ2*TJ2
-           TJ6 = TJ4*TJ2
-           EXPTJ = exp( -TJ*MAGR )
-           TI2MTJ2 = TI2 - TJ2
-           TJ2MTI2 = -TI2MTJ2
-           SA = TI
-           SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
-           SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
-           SD = TJ
-           SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
-           SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
-
-           COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
-           FORCE = FORCE + KECONST*DELTAQ(I)*DELTAQ(J)*((EXPTI*(SA*(SB - (SC/MAGR)) - (SC/MAGR2))) &
-              + (EXPTJ*(SD*(SE - (SF/MAGR)) - (SF/MAGR2))))
-       endif
-
-       FCOUL(1) = FCOUL(1) + DC(1)*FORCE
-       FCOUL(2) = FCOUL(2) + DC(2)*FORCE
-       FCOUL(3) = FCOUL(3) + DC(3)*FORCE
-    endif
-enddo
-COULOMBV = KECONST*COULOMBV
-
-end subroutine Ewald_Real_Space
-
-subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
-
-implicit none
-
-integer, parameter        :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
-real(PREC), intent(in)    :: COULACC
-real(PREC)                :: KECONST, TFACT, RELPERM
-real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
-real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
-real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
-real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
-
-integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-
-COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-
-COULCUT = 12.0D0
-CALPHA = SQRTX/COULCUT
-
-COULCUT2 = COULCUT*COULCUT
-KCUTOFF = TWO*CALPHA*SQRTX
-KCUTOFF2 = KCUTOFF*KCUTOFF
-CALPHA2 = CALPHA*CALPHA
-FOURCALPHA2 = FOUR*CALPHA2
-
-RECIPVECS = ZERO
-RECIPVECS(1,1) = TWO*pi/Box(1,1)
-RECIPVECS(2,2) = TWO*pi/Box(2,2)
-RECIPVECS(3,3) = TWO*pi/Box(3,3)
-LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
-MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
-NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
-
-RELPERM = 1.D0
-KECONST = 14.3996437701414D0*RELPERM
-
-COULOMBV = ZERO
-SINLIST = ZERO
-COSLIST = ZERO
-
-do L = 0,LMAX
-
-  if (L.eq.0) then
-    MMIN = 0
-  else
-    MMIN = -MMAX
-  endif
-
-  L11 = L*RECIPVECS(1,1)
-  L12 = L*RECIPVECS(1,2)
-  L13 = L*RECIPVECS(1,3)
-
-  do M = MMIN,MMAX
+        FCOUL(1) = FCOUL(1) + DC(1)*FORCE
+        FCOUL(2) = FCOUL(2) + DC(2)*FORCE
+        FCOUL(3) = FCOUL(3) + DC(3)*FORCE
+      endif
+    enddo
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space
+
+  subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
 
         NMIN = -NMAX
         if ((L==0).and.(M==0)) then
@@ -575,122 +575,122 @@ subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_
         M23 = L13 + M*RECIPVECS(2,3)
 
         do N = NMIN,NMAX
-           K(1) = M21 + N*RECIPVECS(3,1)
-           K(2) = M22 + N*RECIPVECS(3,2)
-           K(3) = M23 + N*RECIPVECS(3,3)
-           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
-           if (K2.le.KCUTOFF2) then
-              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
-              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
-
-              COSSUM = 0.D0
-              SINSUM = 0.D0
-
-              ! Doing the sin and cos sums
-              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
-              !$OMP REDUCTION(+:COSSUM) & 
-              !$OMP REDUCTION(+:SINSUM)  
-              do I = 1,Nr_atoms
-                 DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
-                 ! We re-use these in the next loop...
-                 SINLIST(I) = sin(DOT)
-                 COSLIST(I) = cos(DOT)
-                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
-                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
-              enddo
-              !$OMP END PARALLEL DO
-              COSSUM2 = COSSUM*COSSUM
-              SINSUM2 = SINSUM*SINSUM
-
-              ! Add up energy and force contributions
-
-              KEPREF = KECONST*PREFACTOR
-              !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I) 
-              do I = 1,Nr_atoms
-                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
-              enddo
-              !$OMP END PARALLEL DO
-
-              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
-           endif
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+            COSSUM = 0.D0
+            SINSUM = 0.D0
+
+            ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+            !$OMP REDUCTION(+:COSSUM) &
+            !$OMP REDUCTION(+:SINSUM)
+            do I = 1,Nr_atoms
+              DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+              COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+              SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            !$OMP END PARALLEL DO
+            COSSUM2 = COSSUM*COSSUM
+            SINSUM2 = SINSUM*SINSUM
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
+            do I = 1,Nr_atoms
+              COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+            enddo
+            !$OMP END PARALLEL DO
+
+            KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
         enddo
-  enddo
-enddo
-
-! Point self energy
-CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
-COULOMBV = COULOMBV - CORRFACT*DELTAQ;
-
-end subroutine Ewald_k_Space_latte
-
-subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
-
-implicit none
-
-integer, parameter        :: PREC = 8
-real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
-real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
-real(PREC), parameter     :: pi = 3.14159265358979323846264D0
-real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
-integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
-real(PREC), intent(in)    :: COULACC, TIMERATIO
-real(PREC)                :: KECONST, TFACT, RELPERM
-real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
-real(PREC)                :: COULCUT, COULCUT2
-real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
-real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
-real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
-real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
-real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
-real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
-real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
-real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
-
-integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-
-COULVOL = LBox(1)*LBox(2)*LBox(3)
-SQRTX = sqrt(-log(COULACC))
-
-ccnt = 0
-
-COULCUT = 12.0D0
-CALPHA = SQRTX/COULCUT
-
-COULCUT2 = COULCUT*COULCUT
-KCUTOFF = TWO*CALPHA*SQRTX
-KCUTOFF2 = KCUTOFF*KCUTOFF
-CALPHA2 = CALPHA*CALPHA
-FOURCALPHA2 = FOUR*CALPHA2
-
-RECIPVECS = ZERO
-RECIPVECS(1,1) = TWO*pi/LBox(1)
-RECIPVECS(2,2) = TWO*pi/LBox(2)
-RECIPVECS(3,3) = TWO*pi/LBox(3)
-LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
-MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
-NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
-
-RELPERM = 1.D0
-KECONST = 14.3996437701414D0*RELPERM
-
-FCOUL = ZERO
-COULOMBV = ZERO
-SINLIST = ZERO
-COSLIST = ZERO
-
-do L = 0,LMAX
-
-  if (L.eq.0) then
-    MMIN = 0
-  else
-    MMIN = -MMAX
-  endif
-
-  L11 = L*RECIPVECS(1,1)
-  L12 = L*RECIPVECS(1,2)
-  L13 = L*RECIPVECS(1,3)
-
-  do M = MMIN,MMAX
+      enddo
+    enddo
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_latte
+
+  subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
+    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/LBox(1)
+    RECIPVECS(2,2) = TWO*pi/LBox(2)
+    RECIPVECS(3,3) = TWO*pi/LBox(3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    FCOUL = ZERO
+    COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
 
         NMIN = -NMAX
         if ((L==0).and.(M==0)) then
@@ -702,50 +702,50 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
         M23 = L13 + M*RECIPVECS(2,3)
 
         do N = NMIN,NMAX
-           K(1) = M21 + N*RECIPVECS(3,1)
-           K(2) = M22 + N*RECIPVECS(3,2)
-           K(3) = M23 + N*RECIPVECS(3,3)
-           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
-           if (K2.le.KCUTOFF2) then
-              PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
-              PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
-
-              COSSUM = 0.D0
-              SINSUM = 0.D0
-
-              ! Doing the sin and cos sums
-              do I = 1,Nr_atoms
-                 DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
-                 ! We re-use these in the next loop...
-                 SINLIST(I) = sin(DOT)
-                 COSLIST(I) = cos(DOT)
-                 COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
-                 SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
-              enddo
-              COSSUM2 = COSSUM*COSSUM
-              SINSUM2 = SINSUM*SINSUM
-
-              ! Add up energy and force contributions
-
-              KEPREF = KECONST*PREFACTOR
-              do I = 1,Nr_atoms
-                 COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
-                 FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
-                 FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
-                 FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
-                 FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
-              enddo
-
-              KEPREF = KEPREF*(COSSUM2 + SINSUM2)
-           endif
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+            COSSUM = 0.D0
+            SINSUM = 0.D0
+
+            ! Doing the sin and cos sums
+            do I = 1,Nr_atoms
+              DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+              COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+              SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            COSSUM2 = COSSUM*COSSUM
+            SINSUM2 = SINSUM*SINSUM
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            do I = 1,Nr_atoms
+              COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+              FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
+              FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
+              FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
+              FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
+            enddo
+
+            KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
         enddo
-  enddo
-enddo
+      enddo
+    enddo
 
-! Point self energy
-CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
-COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
 
-end subroutine Ewald_k_Space
+  end subroutine Ewald_k_Space
 
 end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 2e217a98..76bd2bc2 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -30,7 +30,7 @@ module prg_implicit_fermi_mod
 contains
 
   !> Recursive Implicit Fermi Dirac for finite temperature.
-  !! \param Inv_bml Inverses generated by algorithm.  
+  !! \param Inv_bml Inverses generated by algorithm.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
   !! \param nsteps Number of recursion steps.
@@ -40,8 +40,8 @@ module prg_implicit_fermi_mod
   !! \param occErrLimit Occupation error limit.
   !! \param threshold Threshold for multiplication.
   !! \param tol Tolerance for linear system solver.
-  !! \param SCF_IT The current SCF iteration. 
-  !! \param occiter Counts the total nr of DM calculations during MD. 
+  !! \param SCF_IT The current SCF iteration.
+  !! \param occiter Counts the total nr of DM calculations during MD.
   !! See \cite{niklasson2003}
   subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc, &
        mu, beta, occErrLimit, threshold, tol,SCF_IT, occiter)
@@ -58,8 +58,8 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     integer, intent(inout) :: occiter
 
     type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
-    real(dp) :: trdPdmu, trP0, occErr, alpha 
-    real(dp) :: cnst, ofactor, mustep 
+    real(dp) :: trdPdmu, trP0, occErr, alpha
+    real(dp) :: cnst, ofactor, mustep
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
     integer :: N, M, i, iter, muadj, prev
@@ -75,7 +75,7 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
     call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, I_bml)
-    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml) 
+    call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
 
     occErr = 10.0_dp
     alpha = 1.0_dp
@@ -97,10 +97,10 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
     else
       ! Otherwise use previous inverse as starting guess
-      call bml_copy(Inv_bml(1),ai_bml) 
+      call bml_copy(Inv_bml(1),ai_bml)
     end if
 
-    do while (occErr .gt. occErrLimit .or. muadj .eq. 1) 
+    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < 50)
       iter = iter + 1
       muadj = 0
       write(*,*) 'mu =', mu
@@ -109,18 +109,18 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call bml_copy(h_bml, p_bml)
       call prg_normalize_implicit_fermi(p_bml, cnst, mu)
 
-        do i = 1, nsteps
-            call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
-            ! Y = 2*(P2-P) + I
-            call bml_copy(p2_bml, y_bml)
-            call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
-            call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
-            ! Find inverse ai = (2*(P2-P)+I)^-1
-            !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
-            call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
-            call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
-            call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation  
-        enddo
+      do i = 1, nsteps
+        call bml_multiply_x2(p_bml, p2_bml, threshold, trace)
+        ! Y = 2*(P2-P) + I
+        call bml_copy(p2_bml, y_bml)
+        call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
+        call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
+        ! Find inverse ai = (2*(P2-P)+I)^-1
+        !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
+        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+        call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+        call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
+      enddo
 
       trdPdmu = bml_trace(p_bml)
       trP0 = trdPdmu
@@ -129,40 +129,44 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       occErr = abs(trP0 - nocc)
       write(*,*) 'occerr =', nocc-trP0
 
-      ! If occupation error is too large, do bisection method 
-      if (occerr > 10.0_dp) then 
-         if (nocc-trP0 < 0.0_dp) then
-            if (prev .eq. 1) then
-               alpha = alpha/2
-            endif 
-            prev = -1
-            mu = mu - alpha
-         else 
-            if (prev .eq. -1) then
-               alpha = alpha/2
-            endif
-            prev = 1
-            mu = mu + alpha 
-         endif  
-      ! Otherwise do Newton 
+      ! If occupation error is too large, do bisection method
+      if (occerr > 10.0_dp) then
+        if (nocc-trP0 < 0.0_dp) then
+          if (prev .eq. 1) then
+            alpha = alpha/2
+          endif
+          prev = -1
+          mu = mu - alpha
+        else
+          if (prev .eq. -1) then
+            alpha = alpha/2
+          endif
+          prev = 1
+          mu = mu + alpha
+        endif
+        ! Otherwise do Newton
       else if (occErr .gt. occErrLimit) then
         mustep = (nocc -trP0)/trdPdmu
         if (abs(mustep) > 1.0) then
           mustep = 0.1_dp*mustep
-        end if  
-        mu = mu + mustep 
+        end if
+        mu = mu + mustep
         muadj = 1
       end if
     enddo
-    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge. 
-    ! For now we recompute the DM one extra time if mu was adjusted. 
+
+    if (iter .ge. 50) then 
+            write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
+    end if 
+    ! Adjusting the occupation sometimes causes the perturbation calculation to not converge.
+    ! For now we recompute the DM one extra time if mu was adjusted.
     !if (muadj .eq. 1) then
-      ! Adjust occupation
-     ! call bml_copy(p_bml, d_bml)
-     ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
-     ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
-     ! ofactor = ((nocc - trP0)/trdPdmu) * beta
-     ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    ! Adjust occupation
+    ! call bml_copy(p_bml, d_bml)
+    ! call bml_scale_add_identity(d_bml, -1.0_dp, 1.0_dp, threshold)
+    ! call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
+    ! ofactor = ((nocc - trP0)/trdPdmu) * beta
+    ! call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
     !end if
     occiter = occiter + iter
     call bml_scale(2.0_dp,p_bml)
@@ -177,7 +181,7 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     call bml_deallocate(I_bml)
 
   end subroutine prg_implicit_fermi_save_inverse
-  
+
   !> Recursive Implicit Fermi Dirac for finite temperature.
   !! \param h_bml Input Hamiltonian matrix.
   !! \param p_bml Output density matrix.
@@ -216,17 +220,12 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     N = bml_get_N(h_bml)
     M = bml_get_M(h_bml)
 
-    call bml_print_matrix("h_bml",h_bml,1,10,1,10)
-    call bml_print_matrix("p_bml",p_bml,1,10,1,10)
-    write(*,*) nsteps, k, nocc, &
-       mu, beta, method, osteps, occErrLimit, threshold, tol
-    !stop
     allocate(trace(2))
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, p2_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, d_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, w_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, y_bml)
-    if (k .gt. 2) then
+    if (k .ge. 2) then
       call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux1_bml)
       call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, aux2_bml)
     endif
@@ -292,8 +291,10 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
       trP0 = trdPdmu
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
-      mu = mu + (nocc - trP0)/trdPdmu
       occErr = abs(trP0 - nocc)
+      if (occErr .gt. occErrLimit) then 
+        mu = mu + (nocc - trP0)/trdPdmu
+      end if 
       write(*,*) "mu =", mu
     enddo
 
@@ -304,7 +305,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
 
     call bml_multiply(p_bml, d_bml, w_bml, 1.0_dp, 0.0_dp, threshold)
     ofactor = ((nocc - trP0)/trdPdmu) * beta
-    call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
+    !call bml_add(p_bml, w_bml, 1.0_dp, ofactor, threshold)
     !call bml_print_matrix("P adjusted occupation",p_bml,0,10,0,10)
 
     deallocate(trace)
@@ -313,7 +314,7 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
     call bml_deallocate(w_bml)
     call bml_deallocate(d_bml)
     call bml_deallocate(y_bml)
-    if (k .gt. 2) then
+    if (k .ge. 2) then
       call bml_deallocate(aux1_bml)
       call bml_deallocate(aux2_bml)
     endif
@@ -438,7 +439,7 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     integer, intent(in) :: nsteps
     type(bml_matrix_t) :: B_bml, C_bml, C0_bml
     character(20) :: bml_type
-    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst 
+    real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst
     integer :: N, M, i, j, k
 
     bml_type = bml_get_type(H0_bml)
@@ -451,46 +452,46 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
 
     cnst = beta/(2**(2+nsteps))
 
-      ! P0 = 0.5*II - cnst*(H0-mu0*II)
-      call bml_copy(H0_bml, P0_bml)
-      call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
-
-      ! P1 =  - cnst*H1
-      call bml_copy(H1_bml, P1_bml)
-      call bml_scale(-1.0_dp*cnst, P1_bml)
-      do i = 1, nsteps
-
-        ! Calculate coefficient matrices
-        ! C0 = P0^2
-        call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
-        ! C = P0*P1+P1*P0, B = 2(P1 - C)
-        call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-        call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
-        call bml_copy(P1_bml, B_bml)
-        call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
-        ! Get next P0
-        call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
-        ! Get next P1
-        ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
-        call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
-        call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
-      enddo
+    ! P0 = 0.5*II - cnst*(H0-mu0*II)
+    call bml_copy(H0_bml, P0_bml)
+    call prg_normalize_implicit_fermi(P0_bml, cnst, mu0)
+
+    ! P1 =  - cnst*H1
+    call bml_copy(H1_bml, P1_bml)
+    call bml_scale(-1.0_dp*cnst, P1_bml)
+    do i = 1, nsteps
+
+      ! Calculate coefficient matrices
+      ! C0 = P0^2
+      call bml_multiply(P0_bml, P0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+      ! C = P0*P1+P1*P0, B = 2(P1 - C)
+      call bml_multiply(P0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+      call bml_multiply(P1_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+      call bml_copy(P1_bml, B_bml)
+      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+      ! Get next P0
+      call bml_multiply(Inv_bml(i), C0_bml, P0_bml, 1.0_dp, 0.0_dp, threshold)
+      ! Get next P1
+      ! C = P0*P1+P1*P0 + 2(P1 -P0*P1-P1*P0)*P0(i+1)
+      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+    enddo
 
-      ! dPdmu = beta*P0(I-P0)
-      call bml_copy(P0_bml, B_bml)
-      call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
-      call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-      call bml_scale(beta,C_bml)
-      dPdmu_trace = bml_trace(C_bml)
-      p1_trace = bml_trace(P1_bml)
-      mu1 =  - p1_trace/dPdmu_trace
-      if (abs(dPdmu_trace) > 1e-8) then
-        call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
-      endif 
+    ! dPdmu = beta*P0(I-P0)
+    call bml_copy(P0_bml, B_bml)
+    call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
+    call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+    call bml_scale(beta,C_bml)
+    dPdmu_trace = bml_trace(C_bml)
+    p1_trace = bml_trace(P1_bml)
+    mu1 =  - p1_trace/dPdmu_trace
+    if (abs(dPdmu_trace) > 1e-8) then
+      call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
+    endif
 
-      call bml_deallocate(B_bml)
-      call bml_deallocate(C_bml)
-      call bml_deallocate(C0_bml)
+    call bml_deallocate(B_bml)
+    call bml_deallocate(C_bml)
+    call bml_deallocate(C0_bml)
 
 
   end subroutine prg_implicit_fermi_first_order_response
@@ -513,7 +514,7 @@ end subroutine prg_implicit_fermi_first_order_response
   !! See \cite{niklasson2015}
   subroutine prg_implicit_fermi_response(H0_bml, H1_bml, H2_bml, H3_bml, P0_bml, P1_bml, P2_bml, P3_bml, &
        nsteps, mu0, mu, beta, nocc, occ_tol, lin_tol, order, threshold)
-  
+
     implicit none
 
     type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, H2_bml, H3_bml
@@ -868,7 +869,8 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
     N2 = N*N
     scaled_tol = tol*N
     do while(err > scaled_tol)
-      !write(*,*) 'iter = ', i 
+      !write(*,*) 'iter = ', i
+      !write(*,*) 'ns error =', err
       call bml_copy(ai_bml, tmp_bml)
       call bml_multiply(a_bml, ai_bml, r_bml, 1.0_dp, 0.0_dp, threshold)
       call bml_scale_add_identity(r_bml, -1.0_dp, 1.0_dp, threshold)
@@ -876,12 +878,12 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
       err = bml_fnorm(r_bml)
       !write(*,*) "err = ", err
       !write(*,*) "prev_err = ", prev_err
-      if (10*prev_err < err) then 
+      if (10*prev_err < err) then
         write(*,*) 'NS did not converge, calling conjugate gradient'
         call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.0001_dp, threshold)
-      else 
+      else
         call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
-      endif 
+      endif
       i = i + 1
     enddo
     !write(*,*) "Number of NS iterations:", i
@@ -1004,7 +1006,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
       call bml_add(tmp_bml, w_bml, 1.0_dp, -alpha, threshold)
       r_norm_old = r_norm_new
       r_norm_new = bml_sum_squares(tmp_bml)
-      if (k .gt. 500) then
+      if (k .gt. 50) then
         write(*,*) "Conjugate gradient is not converging"
         stop
       endif
@@ -1124,8 +1126,10 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
       trP0 = trdPdmu
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
-      mu = mu + (nocc - trP0)/trdPdmu
       occErr = abs(trP0 - nocc)
+      if (occErr .gt. occErrLimit) then
+         mu = mu + (nocc - trP0)/trdPdmu
+      end if 
       !write(*,*) "mu = ", mu
     enddo
 
@@ -1136,7 +1140,7 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
 
     call bml_multiply(p_bml, aux_bml, aux1_bml, 1.0_dp, 0.0_dp, threshold)
     ofactor = ((nocc - trP0)/trdPdmu) * beta
-    call bml_add(p_bml, aux1_bml, 1.0_dp, ofactor, threshold)
+    !call bml_add(p_bml, aux1_bml, 1.0_dp, ofactor, threshold)
     !call bml_print_matrix("Diagonalization - Adjusted occupation",p_bml,0,10,0,10)
 
     call bml_deallocate(eigenvectors_bml)
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
index 54f4ecf1..a19cb5b2 100644
--- a/src/prg_xlbokernel_mod.F90
+++ b/src/prg_xlbokernel_mod.F90
@@ -1,7 +1,7 @@
 !> Pre-conditioned O(N) calculation of the kernel for XL-BOMD.
 !! \ingroup PROGRESS
-!! \brief Here are subroutines implementing Niklasson's scheme for 
-!!  low-rank, Krylov subspace approximation of the kernel. 
+!! \brief Here are subroutines implementing Niklasson's scheme for
+!!  low-rank, Krylov subspace approximation of the kernel.
 module prg_xlbokernel_mod
 
   use omp_lib
@@ -11,7 +11,7 @@ module prg_xlbokernel_mod
   use prg_timer_mod
   use prg_parallel_mod
   use prg_ewald_mod
-  use prg_implicit_fermi_mod 
+  use prg_implicit_fermi_mod
 
   implicit none
 
@@ -27,809 +27,810 @@ module prg_xlbokernel_mod
 
 contains
 
-subroutine Invert(A,AI,N)
-
-implicit none
-integer, parameter                          :: PREC = 8
-integer, intent(in)                         :: N
-real(PREC),                 intent(in)      :: A(N,N)
-real(PREC), intent(out)                     :: AI(N,N)
-real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
-integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
-integer                                     :: I,J,K
-
-external DGETRF
-external DGETRI
-
-AI = A
-LDA = N
-M = N
-LWORK = N+N*N
-
-call DGETRF(M, N, AI, LDA, IPIV, INFO)
-call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
-
-end subroutine Invert
-  
-!> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
-!! \param KRes The low rank approximation 
-!! \param KK0_bml The pre-conditioner K0.
-!! \param Res The residual q[n]-n
-!! \param FelTol Relative error tolerance for approximation 
-!! \param L Number of vectors used. 
-!! \param LMAX Maximum nr of vectors to use. 
-!! \param NUMRANK Nr of vectors to use. 
-!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
-!! \param mu The chemical potiential. 
-!! \param beta Scaled inverse temperature.
-!! \param RXYZ Nuclear coordinates.
-!! \param Box Box dimensions. 
-!! \param Hubbard_U Hubbard U list. 
-!! \param Element_Pointer List to keep track of elements. 
-!! \param Nr_atoms The number of atoms. 
-!! \param HDIM Hamiltonian matrix dimension. 
-!! \param Max_Nr_Neigh Max neighbours for Ewald. 
-!! \param Coulomb_acc Coulomb accuracy.
-!! \param nebcoul Neighbour lists.
-!! \param totnebcoul Number of neighbours list. 
-!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
-!! \param S_bml The S matrix. 
-!! \param Z_bml, The Z matrix. 
-!! \param Nocc Occupation. 
-!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
-!! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
-!! \param m_rec Number of recursion steps.
-!! \param threshold Threshold value for matrix truncation.
-!! \param Nr_elem Number of elements in Hubbard list. 
-subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, & 
-    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
-    S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
-
-!! Res = q[n] - n
-!! KK0 is preconditioner
-!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+  subroutine Invert(A,AI,N)
+
+    implicit none
+    integer, parameter                          :: PREC = 8
+    integer, intent(in)                         :: N
+    real(PREC),                 intent(in)      :: A(N,N)
+    real(PREC), intent(out)                     :: AI(N,N)
+    real(PREC)                                  :: WORK(N+N*N)!, C(N,N)
+    integer                                     :: LDA, LWORK, M, INFO, IPIV(N)
+    integer                                     :: I,J,K
+
+    external DGETRF
+    external DGETRI
+
+    AI = A
+    LDA = N
+    M = N
+    LWORK = N+N*N
+
+    call DGETRF(M, N, AI, LDA, IPIV, INFO)
+    call DGETRI(N, AI, N, IPIV, WORK, LWORK, INFO)
+
+  end subroutine Invert
+
+  !> Compute low rank approximation of (K0*J)^(-1)*K0*(q[n]-n)(for LATTE)
+  !! \param KRes The low rank approximation
+  !! \param KK0_bml The pre-conditioner K0.
+  !! \param Res The residual q[n]-n
+  !! \param FelTol Relative error tolerance for approximation
+  !! \param L Number of vectors used.
+  !! \param LMAX Maximum nr of vectors to use.
+  !! \param NUMRANK Nr of vectors to use.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param mu The chemical potiential.
+  !! \param beta Scaled inverse temperature.
+  !! \param RXYZ Nuclear coordinates.
+  !! \param Box Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Pointer List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy.
+  !! \param nebcoul Neighbour lists.
+  !! \param totnebcoul Number of neighbours list.
+  !! \param Hinxlist List to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Nocc Occupation.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param DO_bml, D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param Nr_elem Number of elements in Hubbard list.
+  subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
+       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
+    real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
+    real(dp), intent(in)      :: Hubbard_U(Nr_elem)
+    type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(20)             :: bml_type
+    integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(inout)   :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It,N,MN
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp, start, finish
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
+
+    call timer_prg_init()
+    bml_type = bml_get_type(HO_bml)
+    N = bml_get_N(HO_bml)
+    MN = bml_get_M(HO_bml)
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+
+    call bml_transpose(Z_bml,ZT_bml)
+
+    ! K0Res = KK0*Res temporary for matrix-vector multiplication
+    call bml_set_row(Res_bml,1,Res,1.0_dp*1e-10)
+    call bml_transpose(KK0_bml,KK0T_bml)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,1.0_dp*1e-10)
+    call bml_get_row(K0Res_bml,1,row_NA)
+    K0Res = row_NA
+    dr = K0Res
+
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
+      I = I + 1
+      !write(*,*) 'dr =', norm2(dr)
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
+
+      ! Compute H1 = H(v)
+      dq_v = v
+      call prg_timer_start(1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
+             dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
 
-  implicit none
-  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
-  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-  real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
-  real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
-  real(dp), intent(in)      :: Res(Nr_atoms)
-  integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
-  real(dp), intent(in)      :: Hubbard_U(Nr_elem)
-  type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
-  real(dp)                  :: K0Res(Nr_atoms)
-  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
-  real(dp), intent(in)      :: threshold
-  integer,    intent(in)      :: LMAX
-  character(20)             :: bml_type
-  integer,    intent(in)      :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-  real(dp)                  :: dq_v(Nr_atoms)
-  real(dp), intent(inout)   :: KRes(Nr_atoms)
-  integer                     :: I,J,K,It,N,MN
-  integer, intent(out)        :: L
-  real(dp)                  :: Fel, proj_tmp, start, finish
-  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
-  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
-  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
-  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
-
-  call timer_prg_init()
-  bml_type = bml_get_type(HO_bml)
-  N = bml_get_N(HO_bml)
-  MN = bml_get_M(HO_bml)
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
-  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
-  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
-  call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
-  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
-
-  call bml_transpose(Z_bml,ZT_bml)
-
-  ! K0Res = KK0*Res temporary for matrix-vector multiplication
-  call bml_set_row(Res_bml,1,Res,threshold)
-  call bml_transpose(KK0_bml,KK0T_bml)
-  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
-  call bml_get_row(K0Res_bml,1,row_NA)
-  K0Res = row_NA
-  dr = K0Res
-
-  I = 0
-  Fel = 1.D0  
-  do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
-    I = I + 1
-    vi(:,I) = dr/norm2(dr)
-    do J = 1,I-1
-      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
-    enddo
-    vi(:,I) = vi(:,I)/norm2(vi(:,I))
-    v(:) = vi(:,I)  ! v_i
+      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      call prg_timer_stop(1,1)
 
-    ! Compute H1 = H(v) 
-    dq_v = v 
-    call prg_timer_start(1)
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I)
-    do J = 1,Nr_atoms
-      call Ewald_Real_Space_latte(Coulomb_Pot_Real_I,J,RXYZ,Box, &
-      dq_v,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_elem)
-      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-    call prg_timer_stop(1,1)
 
+      call bml_deallocate(H1_bml)
+      call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
 
-    call bml_deallocate(H1_bml)
-    call bml_zero_matrix(bml_type,bml_element_real,dp,HDIM,MN,H1_bml)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1,Nr_atoms-1
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
+          row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+      enddo
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
-    do It = 1,Nr_atoms-1
-       do K = Hinxlist(It)+1,Hinxlist(It+1)
-           row1(K) = Hubbard_U(Element_Pointer(It))*dq_v(It) + Coulomb_Pot_dq_v(It)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
-    enddo
-     
-    ! H1 = 1/2(S*H1+H1*S)
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(X_bml,H1_bml)
-
-    ! H1 = Z^T H1 Z
-    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-        m_rec, mu, beta, real(nocc,dp), threshold)
-    
-    ! D1 = Z D1 Z^T
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    ! Compute dq/dv
-    call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
-    call bml_get_diagonal(X_bml,row1)
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
-    do It = 1, Nr_atoms-1
-       dq_dv(It) = 0
-       do K = Hinxlist(It)+1,Hinxlist(It+1)
+      ! H1 = 1/2(S*H1+H1*S)
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(X_bml,H1_bml)
+
+      ! H1 = Z^T H1 Z
+      call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      ! Compute D1 = F_FD(HO_bml + eps*H1_bml)/eps at eps = 0
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
+
+      ! D1 = Z D1 Z^T
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      ! Compute dq/dv
+      call bml_multiply(D1_bml,S_bml,X_bml, 1.0_dp,0.0_dp,threshold)
+      call bml_get_diagonal(X_bml,row1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1, Nr_atoms-1
+        dq_dv(It) = 0
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
           dq_dv(It) = dq_dv(It) + row1(K)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    dq_dv(Nr_atoms) = 0
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      dq_dv(Nr_atoms) = 0
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K)
+      enddo
+
+      dr = dq_dv - v
+      ! fi = K0(dq_dv - v)
+      call bml_set_row(Res_bml,1,dr,1.0_dp*1e-10)
+      call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,1.0_dp*1e-10)
+      call bml_get_row(K0Res_bml,1,row_NA)
+      dr = row_NA
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*K0Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+      deallocate(O, M)
+
     enddo
 
-    dr = dq_dv - v
-    ! fi = K0(dq_dv - v)
-    call bml_set_row(Res_bml,1,dr,threshold)
-    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+    deallocate(row1);deallocate(row2);deallocate(row_NA)
+    call bml_deallocate(KK0T_bml)
+    call bml_deallocate(K0Res_bml)
+    call bml_deallocate(Res_bml)
+    call bml_deallocate(ZT_bml)
+    call prg_timer_shutdown()
+
+  end subroutine prg_kernel_multirank_latte
+
+  ! Above routine but for development code
+  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+       S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(10), intent(in)   :: Element_Type(Nr_atoms)
+    integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(inout)   :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
+
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+
+    ! K0Res = KK0*Res temporary for matrix-vector multiplication
+    call bml_transpose(KK0_bml,KK0T_bml)
+    call bml_set_row(Res_bml,1,Res,ONE*1e-14)
+    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
     call bml_get_row(K0Res_bml,1,row_NA)
-    dr = row_NA
-    fi(:,I) = dr
-
-    L = I
-    allocate(O(L,L), M(L,L))
-    do K = 1,L
-    do J = 1,L
-       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
-    enddo
-    enddo
-    call Invert(O,M,L)                        ! M = O^(-1)
-    IdentRes = 0.D0*K0Res
-    KRes = 0.D0
-    do K = 1,L
-    do J = 1,L
-      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
-      IdentRes = IdentRes + proj_tmp*fi(:,K)
-      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
-    enddo
-    enddo
-    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
-    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
-    deallocate(O, M)
+    K0Res = row_NA
+    dr = K0Res
+
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+      I = I + 1
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
 
-  enddo
+      dq_v = v
 
-  deallocate(row1);deallocate(row2);deallocate(row_NA)
-  call bml_deallocate(KK0T_bml)
-  call bml_deallocate(K0Res_bml)
-  call bml_deallocate(Res_bml)
-  call bml_deallocate(ZT_bml)
-  call prg_timer_shutdown() 
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+             dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
 
-  end subroutine prg_kernel_multirank_latte
+      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
-  ! Above routine but for development code   
-  subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
-    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
-    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+      row1 = 0.0_dp
+      do J = 1,HDIM
+        call bml_set_row(H1_bml,J,row1,threshold)
+      enddo
 
-!! Res = q[n] - n
-!! KK0 is preconditioner
-!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+      do J = 1,Nr_atoms
+        do K = H_INDEX_START(J),H_INDEX_END(J)
+          row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
 
-  implicit none
-  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
-  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
-  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-  real(dp), intent(in)      :: Res(Nr_atoms)
-  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
-  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
-  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
-  real(dp)                  :: K0Res(Nr_atoms)
-  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
-  real(dp), intent(in)      :: threshold
-  integer,    intent(in)      :: LMAX
-  character(10), intent(in)   :: Element_Type(Nr_atoms)
-  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
-  real(dp)                  :: dq_v(Nr_atoms)
-  real(dp), intent(inout)   :: KRes(Nr_atoms)
-  integer                     :: I,J,K,It
-  integer, intent(out)        :: L
-  real(dp)                  :: Fel, proj_tmp
-  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
-  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
-  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
-  type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml
-
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
-  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
-  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
-  call bml_zero_matrix("ellpack",bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
-
-  ! K0Res = KK0*Res temporary for matrix-vector multiplication
-  call bml_transpose(KK0_bml,KK0T_bml) 
-  call bml_set_row(Res_bml,1,Res,ONE*1e-14)
-  call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,ONE*1e-14)
-  call bml_get_row(K0Res_bml,1,row_NA)
-  K0Res = row_NA
-  dr = K0Res
-
-  I = 0
-  Fel = 1.D0  
-  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
-    I = I + 1
-    vi(:,I) = dr/norm2(dr)
-    do J = 1,I-1
-      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
-    enddo
-    vi(:,I) = vi(:,I)/norm2(vi(:,I))
-    v(:) = vi(:,I)  ! v_i
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
 
-    dq_v = v 
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
-    do J = 1,Nr_atoms
-      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
-      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
-      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    row1 = 0.0_dp
-    do J = 1,HDIM
-       call bml_set_row(H1_bml,J,row1,threshold)
-    enddo
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
-    do J = 1,Nr_atoms
-       do K = H_INDEX_START(J),H_INDEX_END(J)
-           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(D1_bml,H1_bml)
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-        m_rec, mu, beta, real(nocc,dp), threshold)
-    
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
-    do It = 1, Nr_atoms
-      dq_dv(It) = 0
-      do K = H_INDEX_START(It), H_INDEX_END(It)
-        call bml_get_row(S_bml,K,row1)
-        call bml_get_row(D1_bml,K,row2)
-        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
       enddo
-    enddo
-    !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
+
+      dr = dq_dv - v
+      ! fi = K0(dq_dv - v)
+      call bml_set_row(Res_bml,1,dr,threshold)
+      call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
+      call bml_get_row(K0Res_bml,1,row_NA)
+      dr = row_NA
+      fi(:,I) = dr
+
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*K0Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+      deallocate(O, M)
 
-    dr = dq_dv - v 
-    ! fi = K0(dq_dv - v)
-    call bml_set_row(Res_bml,1,dr,threshold)
-    call bml_multiply(Res_bml,KK0T_bml,K0Res_bml,1.0_dp,0.0_dp,threshold)
-    call bml_get_row(K0Res_bml,1,row_NA)
-    dr = row_NA
-    fi(:,I) = dr
-
-    L = I
-    allocate(O(L,L), M(L,L))
-    do K = 1,L
-    do J = 1,L
-       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
-    enddo
     enddo
-    call Invert(O,M,L)                        ! M = O^(-1)
-    IdentRes = 0.D0*K0Res
-    KRes = 0.D0
-    do K = 1,L
-    do J = 1,L
-      proj_tmp = M(K,J)*dot_product(fi(:,J),K0Res)
-      IdentRes = IdentRes + proj_tmp*fi(:,K)
-      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (K0*J)^(-1)*K0*(q[n]-n) 
-    enddo
-    enddo
-    Fel = norm2(IdentRes-K0Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
-    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
-    deallocate(O, M)
-
-  enddo
 
-  deallocate(row1);deallocate(row2);deallocate(row_NA)
-  call bml_deallocate(KK0T_bml)
-  call bml_deallocate(K0Res_bml)
-  call bml_deallocate(Res_bml)
+    deallocate(row1);deallocate(row2);deallocate(row_NA)
+    call bml_deallocate(KK0T_bml)
+    call bml_deallocate(K0Res_bml)
+    call bml_deallocate(Res_bml)
 
   end subroutine prg_kernel_multirank
 
-!> Compute full inverse Jacobian of q[n]-n (for LATTE)
-!! \param KK The inverse Jacobian.
-!! \param DO_bml Orthogonalized density matrix. 
-!! \param mu0 The chemical potiential. 
-!! \param RXYZ Nuclear coordinates.
-!! \param Box Box dimensions. 
-!! \param Hubbard_U Hubbard U list. 
-!! \param Element_Pointer List to keep track of elements. 
-!! \param Nr_atoms The number of atoms. 
-!! \param HDIM Hamiltonian matrix dimension. 
-!! \param Max_Nr_Neigh Max neighbours for Ewald. 
-!! \param Coulomb_acc Coulomb accuracy
-!! \param Hinxlist List to keep track of atomic positions in the Hamiltonian. 
-!! \param S_bml The S matrix. 
-!! \param Z_bml, The Z matrix. 
-!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
-!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
-!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
-!! \param Nocc Occupation. 
-!! \param m_rec Number of recursion steps.
-!! \param threshold Threshold value for matrix truncation.
-!! \param beta Scaled inverse temperature.
-!! \param Nr_elem Number of elements in Hubbard list. 
-subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
-Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, & 
-Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
-Nocc,m_rec,threshold,beta,Nr_elem)
-
-use bml
-
-implicit none
-integer, parameter             :: PREC = 8, dp = kind(1.0d0)
-integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
-real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
-real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
-real(PREC)                     :: v(Nr_atoms)
-real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
-integer,    intent(in)         :: Hinxlist(HDIM)
-real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
-type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
-type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
-type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
-real(PREC), intent(inout)      :: mu0
-integer,    intent(in)         :: Element_Pointer(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-real(PREC)                     :: dq_v(Nr_atoms)
-real(PREC)                     :: dq_dv(Nr_atoms), err,tol
-real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
-integer                        :: I,J,K, ITER, mm,It,N,MN
-real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)     
-type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
-character(20)                  :: bml_type
-
-  bml_type = bml_get_type(HO_bml)
-  N = bml_get_N(HO_bml)
-  MN = bml_get_M(HO_bml)
-  call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
-  call bml_transpose(Z_bml,ZT_bml)
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
-  Coulomb_Pot_dq_v = ZERO
-  Coulomb_Pot_k = ZERO
-  dq_v = ZERO
-  JJ = ZERO
-  KK = ZERO
-  
-  do J = 1,Nr_atoms  
-    dq_v(J) = ONE
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
-    do I = 1,Nr_atoms   
+  !> Compute full inverse Jacobian of q[n]-n (for LATTE)
+  !! \param KK The inverse Jacobian.
+  !! \param DO_bml Orthogonalized density matrix.
+  !! \param mu0 The chemical potiential.
+  !! \param RXYZ Nuclear coordinates.
+  !! \param Box Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Pointer List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy
+  !! \param Hinxlist List to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param Nocc Occupation.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param beta Scaled inverse temperature.
+  !! \param Nr_elem Number of elements in Hubbard list.
+  subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
+       Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, &
+       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Nocc,m_rec,threshold,beta,Nr_elem)
+
+    use bml
+
+    implicit none
+    integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
+    real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
+    real(PREC)                     :: v(Nr_atoms)
+    real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
+    integer,    intent(in)         :: Hinxlist(HDIM)
+    real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
+    type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+    type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    real(PREC), intent(inout)      :: mu0
+    integer,    intent(in)         :: Element_Pointer(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(PREC)                     :: dq_v(Nr_atoms)
+    real(PREC)                     :: dq_dv(Nr_atoms), err,tol
+    real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+    integer                        :: I,J,K, ITER, mm,It,N,MN
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
+    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
+    character(20)                  :: bml_type
+
+    bml_type = bml_get_type(HO_bml)
+    N = bml_get_N(HO_bml)
+    MN = bml_get_M(HO_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_transpose(Z_bml,ZT_bml)
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    Coulomb_Pot_dq_v = ZERO
+    Coulomb_Pot_k = ZERO
+    dq_v = ZERO
+    JJ = ZERO
+    KK = ZERO
+
+    do J = 1,Nr_atoms
+      dq_v(J) = ONE
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+      do I = 1,Nr_atoms
         call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
-        dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
+             dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
-    Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    call bml_deallocate(H1_bml)
-    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
-
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
-    do I = 1,Nr_atoms-1
-       do K = Hinxlist(I)+1,Hinxlist(I+1)
-           row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-         row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
-    enddo
+      enddo
+      !$OMP END PARALLEL DO
+      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
+           Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      call bml_deallocate(H1_bml)
+      call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+      do I = 1,Nr_atoms-1
+        do K = Hinxlist(I)+1,Hinxlist(I+1)
+          row1(K) = Hubbard_U(Element_Pointer(I))*dq_v(I) + Coulomb_Pot_dq_v(I)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        row1(K) = Hubbard_U(Element_Pointer(Nr_atoms))*dq_v(Nr_atoms) + Coulomb_Pot_dq_v(Nr_atoms)
+      enddo
 
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(X_bml,H1_bml)
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,X_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(X_bml,H1_bml)
 
-    call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(ZT_bml,H1_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec,mu0,beta,real(nocc,PREC),threshold) 
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold)
 
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
 
-    call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
-    call bml_get_diagonal(X_bml,row1)
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
-    do It = 1, Nr_atoms-1
-       dq_dv(It) = 0
-       do K = Hinxlist(It)+1,Hinxlist(It+1)
+      call bml_multiply(D1_bml,S_bml,X_bml,1.0_dp,0.0_dp,threshold)
+      call bml_get_diagonal(X_bml,row1)
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K)
+      do It = 1, Nr_atoms-1
+        dq_dv(It) = 0
+        do K = Hinxlist(It)+1,Hinxlist(It+1)
           dq_dv(It) = dq_dv(It) + row1(K)
-       enddo
-       JJ(It,J) = dq_dv(It)
-    enddo
-    !$OMP END PARALLEL DO
-    dq_dv(Nr_atoms) = 0
-    do K = Hinxlist(Nr_atoms)+1,HDIM
-       dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K) 
+        enddo
+        JJ(It,J) = dq_dv(It)
+      enddo
+      !$OMP END PARALLEL DO
+      dq_dv(Nr_atoms) = 0
+      do K = Hinxlist(Nr_atoms)+1,HDIM
+        dq_dv(Nr_atoms) = dq_dv(Nr_atoms) + row1(K)
+      enddo
+      JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+      dq_v = ZERO
     enddo
-    JJ(Nr_atoms,J) = dq_dv(Nr_atoms)
+
+    do I = 1,Nr_atoms
+      JJ(I,I) = JJ(I,I) - ONE
+    enddo
+
+    call Invert(JJ,KK,Nr_atoms)
+
+    deallocate(row1); deallocate(row2); deallocate(JJ)
+    call bml_deallocate(ZT_bml)
+
+  end subroutine prg_full_kernel_latte
+
+  !> Compute full inverse Jacobian of q[n]-n (for development code)
+  !! \param KK The inverse Jacobian.
+  !! \param DO_bml Orthogonalized density matrix.
+  !! \param mu0 The chemical potiential.
+  !! \param RX,RY,RZ Nuclear coordinates.
+  !! \param Lbox Box dimensions.
+  !! \param Hubbard_U Hubbard U list.
+  !! \param Element_Type List to keep track of elements.
+  !! \param Nr_atoms The number of atoms.
+  !! \param HDIM Hamiltonian matrix dimension.
+  !! \param Max_Nr_Neigh Max neighbours for Ewald.
+  !! \param Coulomb_acc Coulomb accuracy
+  !! \param TIMERATIO Parameter for Ewald
+  !! \param nnRx,nnRy,nnRz Neighbour lists.
+  !! \param nrnnlist Number of neighbours list.
+  !! \param nnType Refers to original order of atoms.
+  !! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian.
+  !! \param S_bml The S matrix.
+  !! \param Z_bml, The Z matrix.
+  !! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
+  !! \param HO_bml, Orthogonalized Hamiltonian matrix.
+  !! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
+  !! \param Nocc Occupation.
+  !! \param Znuc List of nuclear charges.
+  !! \param m_rec Number of recursion steps.
+  !! \param threshold Threshold value for matrix truncation.
+  !! \param beta Scaled inverse temperature.
+  !! \param diagonal Auxillary vector.
+
+  subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, &
+       nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Nocc,Znuc,m_rec,threshold,beta,diagonal)
+
+    use bml
+
+    implicit none
+    integer, parameter             :: PREC = 8, dp = kind(1.0d0)
+    integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
+    real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
+    real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
+    real(PREC)                     :: v(Nr_atoms)
+    real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
+    type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
+    character(10), intent(in)      :: Element_Type(Nr_atoms)
+    integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
+    real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(PREC)                     :: dq_v(Nr_atoms)
+    real(PREC)                     :: dq_dv(Nr_atoms)
+    real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
+    integer                        :: I,J,K, ITER, mm,It
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
+
+    allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    Coulomb_Pot_dq_v = ZERO
+    Coulomb_Pot_k = ZERO
     dq_v = ZERO
-  enddo
-  
-  do I = 1,Nr_atoms
-     JJ(I,I) = JJ(I,I) - ONE
-  enddo
-  
-  call Invert(JJ,KK,Nr_atoms)
-  
-  deallocate(row1); deallocate(row2); deallocate(JJ)
-  call bml_deallocate(ZT_bml)
-
-end subroutine prg_full_kernel_latte
-
-!> Compute full inverse Jacobian of q[n]-n (for development code) 
-!! \param KK The inverse Jacobian.
-!! \param DO_bml Orthogonalized density matrix. 
-!! \param mu0 The chemical potiential. 
-!! \param RX,RY,RZ Nuclear coordinates.
-!! \param Lbox Box dimensions. 
-!! \param Hubbard_U Hubbard U list. 
-!! \param Element_Type List to keep track of elements. 
-!! \param Nr_atoms The number of atoms. 
-!! \param HDIM Hamiltonian matrix dimension. 
-!! \param Max_Nr_Neigh Max neighbours for Ewald. 
-!! \param Coulomb_acc Coulomb accuracy
-!! \param TIMERATIO Parameter for Ewald
-!! \param nnRx,nnRy,nnRz Neighbour lists.
-!! \param nrnnlist Number of neighbours list.
-!! \param nnType Refers to original order of atoms. 
-!! \param H_INDEX_START, H_INDEX_END Lists to keep track of atomic positions in the Hamiltonian. 
-!! \param S_bml The S matrix. 
-!! \param Z_bml, The Z matrix. 
-!! \param Inv_bml, Inverses generated by prg_implicit_fermi_save_inverse.
-!! \param HO_bml, Orthogonalized Hamiltonian matrix. 
-!! \param D1_bml, H1_bml, Y_bml, X_bml Auxillary matrices.
-!! \param Nocc Occupation. 
-!! \param Znuc List of nuclear charges.
-!! \param m_rec Number of recursion steps.
-!! \param threshold Threshold value for matrix truncation.
-!! \param beta Scaled inverse temperature.
-!! \param diagonal Auxillary vector. 
-
-subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
-Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz, & 
-nrnnlist,nnType,H_INDEX_START,H_INDEX_END,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
-Nocc,Znuc,m_rec,threshold,beta,diagonal)
-
-use bml
-
-implicit none
-integer, parameter             :: PREC = 8, dp = kind(1.0d0)
-integer,    intent(in)         :: Nr_atoms, HDIM, Nocc, Max_Nr_Neigh,m_rec
-real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
-real(PREC), intent(in)         :: Coulomb_acc, TIMERATIO,threshold,beta
-real(PREC)                     :: v(Nr_atoms)
-real(PREC), intent(in)         :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-integer,    intent(in)         :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
-real(PREC), intent(in)         :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
-type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
-type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
-type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml 
-real(PREC), intent(inout)      :: mu0, diagonal(HDIM)
-character(10), intent(in)      :: Element_Type(Nr_atoms)
-integer,    intent(in)         :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)         :: nnRx(Nr_atoms,Max_Nr_Neigh)
-real(PREC), intent(in)         :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
-real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-real(PREC)                     :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
-real(PREC)                     :: dq_v(Nr_atoms)
-real(PREC)                     :: dq_dv(Nr_atoms) 
-real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
-integer                        :: I,J,K, ITER, mm,It
-real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)        
-
-  allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
-  Coulomb_Pot_dq_v = ZERO
-  Coulomb_Pot_k = ZERO
-  dq_v = ZERO
-  JJ = ZERO
-  KK = ZERO
-  
-  do J = 1,Nr_atoms  
-    dq_v(J) = ONE
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
-    do I = 1,Nr_atoms   
+    JJ = ZERO
+    KK = ZERO
+
+    do J = 1,Nr_atoms
+      dq_v(J) = ONE
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
+      do I = 1,Nr_atoms
         call Ewald_Real_Space_Single(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,I,RX,RY,RZ,LBox, &
-        dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
+             dq_v,J,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,HDIM,Max_Nr_Neigh)
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
-    TIMERATIO,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    diagonal = 0.0_dp
-    do I = 1,HDIM
-       call bml_set_row(H1_bml,I,diagonal,threshold)
-    enddo
-    
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
-    do I = 1,Nr_atoms
-       do K = H_INDEX_START(I),H_INDEX_END(I)
-           diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call bml_set_diagonal(H1_bml,diagonal,threshold)
-    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(D1_bml,H1_bml)
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec,mu0,beta,real(nocc,PREC),threshold) 
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    do It = 1, Nr_atoms
-       dq_dv(It) = 0
-       do K = H_INDEX_START(It), H_INDEX_END(It)
+      enddo
+      !$OMP END PARALLEL DO
+      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+           TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+
+      diagonal = 0.0_dp
+      do I = 1,HDIM
+        call bml_set_row(H1_bml,I,diagonal,threshold)
+      enddo
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,K)
+      do I = 1,Nr_atoms
+        do K = H_INDEX_START(I),H_INDEX_END(I)
+          diagonal(K) = Hubbard_U(I)*dq_v(I) + Coulomb_Pot_dq_v(I)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
+
+      call bml_set_diagonal(H1_bml,diagonal,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
+
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec,mu0,beta,real(nocc,PREC),threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
           call bml_get_row(S_bml,K,row1)
           call bml_get_row(D1_bml,K,row2)
           dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
-       enddo
-       JJ(It,J) = dq_dv(It)
+        enddo
+        JJ(It,J) = dq_dv(It)
+      enddo
+      dq_v = ZERO
     enddo
-    dq_v = ZERO
-  enddo
 
-  do I = 1,Nr_atoms
-     JJ(I,I) = JJ(I,I) - ONE
-  enddo
-  call Invert(JJ,KK,Nr_atoms)
-  deallocate(row1); deallocate(row2); deallocate(JJ)
+    do I = 1,Nr_atoms
+      JJ(I,I) = JJ(I,I) - ONE
+    enddo
+    call Invert(JJ,KK,Nr_atoms)
+    deallocate(row1); deallocate(row2); deallocate(JJ)
+
+  end subroutine prg_full_kernel
+
+
+  ! Compute the low-rank kernel matrix. (For development code)
+  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
+       Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
+       S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+
+    !! Res = q[n] - n
+    !! KK0 is preconditioner
+    !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+
+    implicit none
+    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
+    real(dp), intent(in)      :: Res(Nr_atoms)
+    integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
+    real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
+    type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
+    real(dp)                  :: K0Res(Nr_atoms)
+    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    type(bml_matrix_t),intent(inout) :: KK0_bml
+    real(dp), intent(in)      :: threshold
+    integer,    intent(in)      :: LMAX
+    character(10), intent(in)   :: Element_Type(Nr_atoms)
+    integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
+    real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
+    real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
+    real(dp)                  :: dq_v(Nr_atoms)
+    real(dp), intent(out)     :: KRes(Nr_atoms)
+    integer                     :: I,J,K,It,col,row
+    integer, intent(out)        :: L
+    real(dp)                  :: Fel, proj_tmp,elem
+    real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
+    real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
+    real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
+
+    allocate(row1(HDIM));allocate(row2(HDIM));
+
+    dr = Res
+    I = 0
+    Fel = 1.D0
+    do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
+      I = I + 1
+      vi(:,I) = dr/norm2(dr)
+      do J = 1,I-1
+        vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
+      enddo
+      vi(:,I) = vi(:,I)/norm2(vi(:,I))
+      v(:) = vi(:,I)  ! v_i
 
-end subroutine prg_full_kernel
+      dq_v = v
 
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
+      do J = 1,Nr_atoms
+        call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
+             dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
+        Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
+      enddo
+      !$OMP END PARALLEL DO
 
-! Compute the low-rank kernel matrix. (For development code) 
-  subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX,RY,RZ,LBox,Hubbard_U,Element_Type, & 
-    Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,H_INDEX_START,H_INDEX_END, &
-    S_bml,Z_bml,Nocc,Znuc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold)
+      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
-!! Res = q[n] - n
-!! KK0 is preconditioner
-!! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
+      row1 = 0.0_dp
+      do J = 1,HDIM
+        call bml_set_row(H1_bml,J,row1,threshold)
+      enddo
 
-  implicit none
-  integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
-  real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-  real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
-  real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
-  real(dp), intent(in)      :: Res(Nr_atoms)
-  integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
-  real(dp), intent(in)      :: Znuc(Nr_atoms), Hubbard_U(Nr_atoms)
-  type(bml_matrix_t), intent(in)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec)
-  real(dp)                  :: K0Res(Nr_atoms)
-  type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
-  type(bml_matrix_t),intent(inout) :: KK0_bml
-  real(dp), intent(in)      :: threshold
-  integer,    intent(in)      :: LMAX
-  character(10), intent(in)   :: Element_Type(Nr_atoms)
-  integer,    intent(in)      :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRx(Nr_atoms,Max_Nr_Neigh)
-  real(dp), intent(in)      :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
-  real(dp)                  :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms),dq_dv(Nr_atoms)
-  real(dp)                  :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
-  real(dp)                  :: Coulomb_Force_Real_I(3),Coulomb_Force_k(3,Nr_atoms)
-  real(dp)                  :: dq_v(Nr_atoms)
-  real(dp), intent(out)     :: KRes(Nr_atoms)
-  integer                     :: I,J,K,It,col,row
-  integer, intent(out)        :: L
-  real(dp)                  :: Fel, proj_tmp,elem
-  real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
-  real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
-  real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:)
-
-  allocate(row1(HDIM));allocate(row2(HDIM));
-
-  dr = Res
-  I = 0
-  Fel = 1.D0  
-  do while ((Fel > FelTol).AND.(I < (LMAX)))   !! Fel = "Error" in Swedish
-    I = I + 1
-    vi(:,I) = dr/norm2(dr)
-    do J = 1,I-1
-      vi(:,I) = vi(:,I) - dot_product(vi(:,I),vi(:,J))*vi(:,J)  !! Orthogonalized v_i as in Eq. (42) Ref. [*]
-    enddo
-    vi(:,I) = vi(:,I)/norm2(vi(:,I))
-    v(:) = vi(:,I)  ! v_i
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
+      do J = 1,Nr_atoms
+        do K = H_INDEX_START(J),H_INDEX_END(J)
+          row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
+        enddo
+      enddo
+      !$OMP END PARALLEL DO
 
-    dq_v = v 
+      call bml_set_diagonal(H1_bml,row1,threshold)
+      call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
+      call bml_copy(D1_bml,H1_bml)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I,Coulomb_Force_Real_I)
-    do J = 1,Nr_atoms
-      call Ewald_Real_Space(Coulomb_Pot_Real_I,Coulomb_Force_Real_I,J,RX,RY,RZ,LBox, &
-      dq_v,Hubbard_U,Element_Type,Nr_atoms,Coulomb_acc,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
-      Coulomb_Pot_Real(J) = Coulomb_Pot_Real_I
-    enddo
-    !$OMP END PARALLEL DO
-    
-    call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-    Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
-
-    row1 = 0.0_dp
-    do J = 1,HDIM
-       call bml_set_row(H1_bml,J,row1,threshold)
-    enddo
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+      call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(J,K)
-    do J = 1,Nr_atoms
-       do K = H_INDEX_START(J),H_INDEX_END(J)
-           row1(K) = Hubbard_U(J)*dq_v(J) + Coulomb_Pot_dq_v(J)
-       enddo
-    enddo
-    !$OMP END PARALLEL DO
-
-    call bml_set_diagonal(H1_bml,row1,threshold)
-    call bml_multiply(S_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold) 
-    call bml_multiply(H1_bml,S_bml,D1_bml,0.5_dp,0.5_dp,threshold)
-    call bml_copy(D1_bml,H1_bml)
-
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(X_bml,H1_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-    call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
-
-    call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-        m_rec, mu, beta, real(nocc,dp), threshold)
-    
-    call bml_transpose(Z_bml,X_bml)
-    call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
-    call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
-
-    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
-    do It = 1, Nr_atoms
-      dq_dv(It) = 0
-      do K = H_INDEX_START(It), H_INDEX_END(It)
-        call bml_get_row(S_bml,K,row1)
-        call bml_get_row(D1_bml,K,row2)
-        dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+      call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
+           m_rec, mu, beta, real(nocc,dp), threshold)
+
+      call bml_transpose(Z_bml,X_bml)
+      call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
+      call bml_multiply(Y_bml,X_bml,D1_bml,1.0_dp,0.0_dp,threshold)
+
+      !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(It,K,row1,row2)
+      do It = 1, Nr_atoms
+        dq_dv(It) = 0
+        do K = H_INDEX_START(It), H_INDEX_END(It)
+          call bml_get_row(S_bml,K,row1)
+          call bml_get_row(D1_bml,K,row2)
+          dq_dv(It) = dq_dv(It) + dot_product(row1,row2)
+        enddo
       enddo
-    enddo
-    !$OMP END PARALLEL DO
+      !$OMP END PARALLEL DO
 
-    dr = dq_dv - v 
-    fi(:,I) = dr
+      dr = dq_dv - v
+      fi(:,I) = dr
 
-    L = I
-    allocate(O(L,L), M(L,L))
-    do K = 1,L
-    do J = 1,L
-       O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
-    enddo
-    enddo
-    call Invert(O,M,L)                        ! M = O^(-1)
-    IdentRes = 0.D0*Res
-    KRes = 0.D0
-    do K = 1,L
-    do J = 1,L
-      proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
-      IdentRes = IdentRes + proj_tmp*fi(:,K)
-      KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n) 
-    enddo
+      L = I
+      allocate(O(L,L), M(L,L))
+      do K = 1,L
+        do J = 1,L
+          O(K,J) = dot_product(fi(:,K),fi(:,J))  ! O_KJ = < fv_i(K) | fv_i(J) >  see below Eq. (31)
+        enddo
+      enddo
+      call Invert(O,M,L)                        ! M = O^(-1)
+      IdentRes = 0.D0*Res
+      KRes = 0.D0
+      do K = 1,L
+        do J = 1,L
+          proj_tmp = M(K,J)*dot_product(fi(:,J),Res)
+          IdentRes = IdentRes + proj_tmp*fi(:,K)
+          KRes = KRes + proj_tmp*vi(:,K)            !! KRes becomes the rank-L approximate of (J)^(-1)*(q[n]-n)
+        enddo
+      enddo
+      Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
+      write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
+
+      ! Does not work, need to normalize matrix
+      if ((Fel > FelTol).AND.(I < (LMAX))) then
+        deallocate(O, M)
+      else
+        do row = 1,Nr_atoms
+          do col = 1,Nr_atoms
+            elem = 0.0
+            do K = 1,L
+              do J = 1,L
+                elem = elem + M(J,K)*vi(row,J)*fi(col,K)
+              enddo
+            enddo
+            if (abs(elem) > threshold) then
+              call bml_set_element(KK0_bml,row,col,elem)
+            endif
+          enddo
+        enddo
+        deallocate(O,M)
+      endif
     enddo
-    Fel = norm2(IdentRes-Res)/norm2(IdentRes)  !! RELATIVE RESIDUAL ERROR ESTIMATE Eq. (48) Ref. [*]
-    write(*,*) '# I, L, Fel = ',I,L,Fel         !! Fel goes down with L
-  
-    ! Does not work, need to normalize matrix  
-        if ((Fel > FelTol).AND.(I < (LMAX))) then 
-          deallocate(O, M)
-        else 
-         do row = 1,Nr_atoms
-         do col = 1,Nr_atoms 
-           elem = 0.0
-           do K = 1,L
-             do J = 1,L 
-               elem = elem + M(J,K)*vi(row,J)*fi(col,K)
-             enddo
-           enddo 
-           if (abs(elem) > threshold) then
-             call bml_set_element(KK0_bml,row,col,elem)
-           endif
-         enddo
-         enddo  
-         deallocate(O,M)
-         endif
-    enddo 
     deallocate(row1); deallocate(row2);
 
   end subroutine prg_kernel_matrix_multirank
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cbc484e7..f2d8d5df 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -56,6 +56,7 @@ progress_test(prg_density_T)
 progress_test(prg_density_T_fermi)
 progress_test(prg_sp2_basic)
 progress_test(prg_implicit_fermi)
+progress_test(prg_implicit_fermi_save_inverse)
 progress_test(prg_sp2_alg1_dense)
 progress_test(prg_sp2_alg2_dense)
 progress_test(prg_sp2_alg1_ellpack)
diff --git a/tests/src/main.F90 b/tests/src/main.F90
index cd2fe3d2..87285c05 100644
--- a/tests/src/main.F90
+++ b/tests/src/main.F90
@@ -31,7 +31,8 @@ program main
 
   implicit none
 
-  integer :: norb, mdim, verbose, i
+  integer :: norb, mdim, verbose
+  type(bml_matrix_t) :: inv_bml(10)  
   type(bml_matrix_t) :: ham_bml
   type(bml_matrix_t) :: rho_bml, rho1_bml
   type(bml_matrix_t) :: rho_ortho_bml
@@ -59,7 +60,7 @@ program main
   real(dp) :: mineval, maxeval, occerrlimit
   real(dp), allocatable :: gbnd(:)
   integer :: minsp2iter, icount, nodesPerPart, occsteps
-  integer :: norecs
+  integer :: norecs,occiter,i
   integer :: maxsp2iter, npts, sp2all_timer, sp2all_timer_init
   integer, allocatable :: pp(:), signlist(:)
   real(dp), allocatable :: vv(:)
@@ -189,6 +190,8 @@ program main
 
     mu = 0.2_dp
     beta = 4.0_dp !nocc,osteps,occerrlimit
+    norecs = 10
+    bml_type = 'ellpack'
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
@@ -196,7 +199,7 @@ program main
     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-    call prg_implicit_fermi(ham_bml, rho1_bml, 10, 2, 10.0_dp, mu, beta, 0, 1, 1.0_dp, threshold, 10e-8_dp)
+    call prg_implicit_fermi(ham_bml, rho1_bml, norecs, 2, 10.0_dp, mu, beta, 0, 1, 1.0_dp, threshold, 1e-6_dp)
 
     mu = 0.2_dp
 
@@ -205,11 +208,44 @@ program main
     call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
 
     error_calc = bml_fnorm(rho1_bml)
+    write(*,*) error_calc
     if(error_calc.gt.0.1_dp)then
       write(*,*) "Error in Implicit Fermi expansion ","Error = ",error_calc
       error stop
     endif
 
+  case("prg_implicit_fermi_save_inverse") 
+
+    mu = 0.2_dp
+    beta = 4.0_dp !nocc,osteps,occerrlimit
+    norecs = 10
+    nocc = 10.0_dp
+
+    do i = 1,norecs
+      call bml_identity_matrix(bml_type,bml_element_real,dp,norb,norb,inv_bml(i))
+    enddo 
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
+
+    call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
+
+    call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
+    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
+    write(*,*) mu
+    mu = 0.2_dp  
+    call prg_implicit_fermi_save_inverse(inv_bml,ham_bml,rho_bml,norecs,nocc,mu,beta,1e-4_dp, threshold, 1e-5_dp, 1,occiter)
+
+    write(*,*) mu
+    call bml_scale(0.5_dp,rho_bml)
+    call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
+
+    error_calc = bml_fnorm(rho1_bml)
+    if(error_calc.gt.0.1_dp)then
+      write(*,*) "Error in Implicit Fermi expansion save inverse","Error = ",error_calc
+      error stop
+    endif
+
   case("prg_sp2_basic") !Sp2 original version
 
     call prg_timer_start(loop_timer)
@@ -979,6 +1015,7 @@ program main
       error stop
     endif
 
+
     call prg_timer_stop(loop_timer)
 
   case("prg_buildzsparse")  ! Building inverse overlap factor matrix (Lowdin method)
@@ -1005,7 +1042,6 @@ program main
     call prg_buildZsparse(over_bml,aux_bml,1,mdim,bml_type,zk1_bml,zk2_bml,zk3_bml&
          &,zk4_bml,zk5_bml,zk6_bml,4,4,3,threshold,threshold,.true.,1)
 
-
     call bml_add_deprecated(-1.0_dp,aux_bml,1.0_dp,zmat_bml,0.0_dp)
 
     error_calc = bml_fnorm(aux_bml)
@@ -1095,6 +1131,7 @@ program main
     call bml_add_deprecated(-1.0_dp,rho_bml,1.0_dp,rho1_bml,0.0_dp)
     error_calc = bml_fnorm(rho_bml)
     write(*,*)error_calc
+    
     if(error_calc.gt.error_tol)then
       write(*,*) "Error is too high", error_calc
       error stop
@@ -1135,8 +1172,6 @@ program main
       error stop
     endif
 
-
-
   case default
 
     write(*,*)"ERROR: unknown test ",test

From da4eb81694380a9ab30e94739a8e79bdfd9c3049 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Fri, 16 Jul 2021 17:51:18 +0200
Subject: [PATCH 07/10] Added kernel routine

---
 src/CMakeLists.txt             |   1 +
 src/prg_ewald_mod.F90          | 242 +++++++++++++++++++++++++++++++--
 src/prg_implicit_fermi_mod.F90 | 125 +++++++++++------
 src/prg_xlbokernel_mod.F90     |  73 ++++++----
 4 files changed, 363 insertions(+), 78 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9c30e5be..3ec8ee57 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -96,6 +96,7 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/prg_chebyshev_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_densitymatrix_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_dos_mod.mod
+  ${CMAKE_CURRENT_BINARY_DIR}/prg_ewald_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_extras_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_genz_mod.mod
   ${CMAKE_CURRENT_BINARY_DIR}/prg_graph_mod.mod
diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
index 95127797..5539c12d 100644
--- a/src/prg_ewald_mod.F90
+++ b/src/prg_ewald_mod.F90
@@ -15,8 +15,10 @@ module prg_ewald_mod
   public :: Ewald_Real_Space_Single_latte
   public :: Ewald_Real_Space
   public :: Ewald_Real_Space_latte
-  public :: Ewald_k_space
+  public :: Ewald_Real_Space_Test
+  public :: Ewald_k_space_latte_single
   public :: Ewald_k_space_latte
+  public :: Ewald_k_space_Test
 
 contains
 
@@ -374,6 +376,123 @@ subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
 
   end subroutine Ewald_Real_Space_latte
 
+  subroutine  Ewald_Real_Space_Test(COULOMBV,I,RX,RY,RZ,LBox, &
+       DELTAQ,U,Element_Type,Nr_atoms,COULACC,nnRx,nnRy,nnRz,nrnnlist,nnType,Max_Nr_Neigh)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh, I
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
+    real(PREC), intent(in)    :: U(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    character(10), intent(in) :: Element_Type(Nr_atoms)
+    integer,    intent(in)    :: nrnnlist(Nr_atoms), nnType(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRx(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(in)    :: nnRy(Nr_atoms,Max_Nr_Neigh), nnRz(Nr_atoms,Max_Nr_Neigh)
+    real(PREC), intent(out)   :: COULOMBV
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: J,K, ccnt, nnI
+
+    COULVOL = LBox(1)*LBox(2)*LBox(3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    COULOMBV = ZERO
+
+    TI = TFACT*U(I)
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RX(I)
+    Ra(2) = RY(I)
+    Ra(3) = RZ(I)
+
+!    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(nnI,J,Rb,Rab,dR,MAGR,MAGR2,TJ,DC,Z,NUMREP_ERFC,CA) &
+!    !$OMP REDUCTION(+:COULOMBV)
+    do nnI = 1,nrnnlist(I)
+      Rb(1) = nnRx(I,nnI)
+      Rb(2) = nnRy(I,nnI)
+      Rb(3) = nnRz(I,nnI)
+      J = nnType(I,nnI)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(J)
+        DC = Rab/dR
+
+        ! Not Using Numerical Recipes ERFC
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        COULOMBV = COULOMBV + DELTAQ(J)*CA
+        ccnt = ccnt + 1
+        !TEST(ccnt) = DELTAQ(J)*CA
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Type(I).eq.Element_Type(J)) then
+          COULOMBV = COULOMBV - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+          !TEST(ccnt) = - DELTAQ(J)*EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          COULOMBV = COULOMBV - (DELTAQ(J)*(EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
+    enddo
+!    !$OMP END PARALLEL DO
+    COULOMBV = KECONST*COULOMBV
+
+  end subroutine Ewald_Real_Space_Test
+
   subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
        DELTAQ,U,Element_Type,Nr_atoms,COULACC,TIMERATIO,nnRx,nnRy,nnRz,nrnnlist,nnType,HDIM,Max_Nr_Neigh)
 
@@ -623,21 +742,119 @@ subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_
 
   end subroutine Ewald_k_Space_latte
 
-  subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TIMERATIO,Max_Nr_Neigh)
+  subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULACC)
 
     implicit none
 
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, J
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3), DELTAQ(Nr_atoms)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2
+    real(PREC)                :: CORRFACT,FOURCALPHA2
+    real(PREC)                :: RECIPVECS(3,3)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: IDOT, JDOT, COSJDOT, SINJDOT, KEPREF
+
+    integer                   :: I,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    COULOMBV = ZERO
+
+    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,IDOT,JDOT,COSJDOT,SINJDOT,L,M,N,MMIN,MMAX,NMIN,NMAX,L11,M22,K,K2,PREFACTOR,KEPREF)
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M22 = M*RECIPVECS(2,2)
+
+        do N = NMIN,NMAX
+          K(1) = L11 
+          K(2) = M22 
+          K(3) = N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+
+          PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+          KEPREF = KECONST*PREFACTOR
+          JDOT = K(1)*RXYZ(1,J) + K(2)*RXYZ(2,J) + K(3)*RXYZ(3,J)
+          SINJDOT = sin(JDOT)
+          COSJDOT = cos(JDOT) 
+          do I = 1,Nr_atoms
+            IDOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+            COULOMBV(I) = COULOMBV(I) + KEPREF*DELTAQ(J)*(COSJDOT*cos(IDOT)+SINJDOT*sin(IDOT))
+          enddo
+        enddo
+      enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    ! Point self energy
+    CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_latte_single
+
+  subroutine Ewald_k_Space_Test(COULOMBV,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
+!
+    implicit none
+!
     integer, parameter        :: PREC = 8
     real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
     real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
     real(PREC), parameter     :: pi = 3.14159265358979323846264D0
     real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
     integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh
-    real(PREC), intent(in)    :: COULACC, TIMERATIO
+    real(PREC), intent(in)    :: COULACC
     real(PREC)                :: KECONST, TFACT, RELPERM
     real(PREC), intent(in)    :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3), DELTAQ(Nr_atoms)
     real(PREC)                :: COULCUT, COULCUT2
-    real(PREC), intent(out)   :: COULOMBV(Nr_atoms), FCOUL(3,Nr_atoms)
+    real(PREC), intent(out)   :: COULOMBV(Nr_atoms)
     real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
     real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
     real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
@@ -647,7 +864,7 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
     real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
 
     integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-
+!
     COULVOL = LBox(1)*LBox(2)*LBox(3)
     SQRTX = sqrt(-log(COULACC))
 
@@ -673,7 +890,6 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
     RELPERM = 1.D0
     KECONST = 14.3996437701414D0*RELPERM
 
-    FCOUL = ZERO
     COULOMBV = ZERO
     SINLIST = ZERO
     COSLIST = ZERO
@@ -714,6 +930,9 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
             SINSUM = 0.D0
 
             ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) &
+            !$OMP REDUCTION(+:COSSUM) &
+            !$OMP REDUCTION(+:SINSUM)
             do I = 1,Nr_atoms
               DOT = K(1)*RX(I) + K(2)*RY(I) + K(3)*RZ(I)
               ! We re-use these in the next loop...
@@ -722,19 +941,18 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
               COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
               SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
             enddo
+            !$OMP END PARALLEL DO
             COSSUM2 = COSSUM*COSSUM
             SINSUM2 = SINSUM*SINSUM
 
             ! Add up energy and force contributions
-
+!
             KEPREF = KECONST*PREFACTOR
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
             do I = 1,Nr_atoms
               COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
-              FORCE = KEPREF*DELTAQ(I)*(SINLIST(I)*COSSUM - COSLIST(I)*SINSUM)
-              FCOUL(1,I) = FCOUL(1,I) + FORCE*K(1)
-              FCOUL(2,I) = FCOUL(2,I) + FORCE*K(2)
-              FCOUL(3,I) = FCOUL(3,I) + FORCE*K(3)
             enddo
+            !$OMP END PARALLEL DO
 
             KEPREF = KEPREF*(COSSUM2 + SINSUM2)
           endif
@@ -746,6 +964,6 @@ subroutine Ewald_k_Space(COULOMBV,FCOUL,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,TI
     CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
     COULOMBV = COULOMBV - CORRFACT*DELTAQ;
 
-  end subroutine Ewald_k_Space
+  end subroutine Ewald_k_Space_Test
 
 end module prg_ewald_mod
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 76bd2bc2..37d03618 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -58,11 +58,11 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     integer, intent(inout) :: occiter
 
     type(bml_matrix_t) :: w_bml, y_bml, d_bml, aux_bml, p2_bml, I_bml, ai_bml
-    real(dp) :: trdPdmu, trP0, occErr, alpha
-    real(dp) :: cnst, ofactor, mustep
+    real(dp) :: trdPdmu, trP0, occErr, alpha, newerr
+    real(dp) :: cnst, ofactor, mustep, preverr
     real(dp), allocatable :: trace(:), gbnd(:)
     character(20) :: bml_type
-    integer :: N, M, i, iter, muadj, prev
+    integer :: N, M, i, iter, muadj, prev, maxiter
 
     bml_type = bml_get_type(h_bml)
     N = bml_get_N(h_bml)
@@ -78,9 +78,12 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     call bml_identity_matrix(bml_type, bml_element_real, dp, N, M, ai_bml)
 
     occErr = 10.0_dp
-    alpha = 1.0_dp
+    newerr = 1000_dp
+    preverr = 1000_dp
+    alpha = 8.0_dp
     prev = 0
     iter = 0
+    maxiter = 30
     cnst = beta/(1.0_dp*2**(nsteps+2))
 
     if (SCF_IT .eq. 1) then
@@ -95,12 +98,15 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
       call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
       call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+      do i = 1, nsteps
+        call bml_copy(I_bml, Inv_bml(i))
+      enddo 
     else
       ! Otherwise use previous inverse as starting guess
       call bml_copy(Inv_bml(1),ai_bml)
     end if
 
-    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < 50)
+    do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < maxiter)
       iter = iter + 1
       muadj = 0
       write(*,*) 'mu =', mu
@@ -116,46 +122,65 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find inverse ai = (2*(P2-P)+I)^-1
-        !call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.01_dp, threshold)
-        call prg_newtonschulz(y_bml, ai_bml, d_bml, w_bml, aux_bml, I_bml, tol, threshold)
-        call bml_multiply(ai_bml, p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
-        call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
+        !call prg_conjgrad(y_bml, Inv_bml(i), I_bml, aux_bml, d_bml, w_bml, tol, threshold)
+        !call bml_copy(Inv_bml(i),ai_bml) 
+        call prg_newtonschulz(y_bml, Inv_bml(i), d_bml, w_bml, aux_bml, I_bml, tol, threshold)
+        call bml_multiply(Inv_bml(i), p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
+        !call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
       enddo
 
-      trdPdmu = bml_trace(p_bml)
-      trP0 = trdPdmu
-      trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
-      trdPdmu = beta * trdPdmu
+      trP0 = bml_trace(p_bml)
+      trdPdmu = beta*(trP0 - bml_sum_squares(p_bml)) ! sum p(i,j)**2
       occErr = abs(trP0 - nocc)
-      write(*,*) 'occerr =', nocc-trP0
-
+      write(*,*) 'occerr =', occErr
+       
       ! If occupation error is too large, do bisection method
-      if (occerr > 10.0_dp) then
-        if (nocc-trP0 < 0.0_dp) then
-          if (prev .eq. 1) then
-            alpha = alpha/2
-          endif
+      if (occerr > 1.0_dp) then
+     ! if (newerr > occerr) then 
+        if (nocc-trP0 < 0.0_dp .and. prev .eq. -1) then
           prev = -1
-          mu = mu - alpha
-        else
-          if (prev .eq. -1) then
-            alpha = alpha/2
-          endif
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. 1) then 
           prev = 1
-          mu = mu + alpha
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. -1) then
+          prev = 1
+          alpha = alpha/2
+      !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    do while(newerr > occerr)
+      !      alpha = alpha/2
+      !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    enddo
+        else 
+          prev = -1 
+          alpha = alpha/2 
+      !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    do while(newerr > occerr)
+      !      alpha = alpha/2
+      !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+      !    enddo
         endif
+        !newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+        !do while(newerr > occerr .and. abs(occerr-newerr)/occerr<0.1 )
+        !  alpha = alpha/2
+        !  newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+        !enddo
+!        write(*,*) 'newerr =', newerr
+        mu = mu + prev*alpha
+        muadj = 1
+
         ! Otherwise do Newton
       else if (occErr .gt. occErrLimit) then
         mustep = (nocc -trP0)/trdPdmu
-        if (abs(mustep) > 1.0) then
-          mustep = 0.1_dp*mustep
-        end if
+        !if (abs(mustep) > 1.0 .or. preverr < occErr) then
+        !  alpha = alpha/2
+        !  mustep = alpha*mustep
+        !end if
         mu = mu + mustep
         muadj = 1
+        preverr = occErr
       end if
     enddo
 
-    if (iter .ge. 50) then 
+    if (iter .ge. maxiter) then 
             write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
     end if 
     ! Adjusting the occupation sometimes causes the perturbation calculation to not converge.
@@ -432,12 +457,12 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     implicit none
 
     type(bml_matrix_t), intent(in) :: H0_bml, H1_bml, Inv_bml(nsteps)
-    type(bml_matrix_t), intent(inout) :: P0_bml, P1_bml
+    type(bml_matrix_t), intent(inout) :: P0_bml,P1_bml
     real(dp), intent(in) :: mu0, threshold
     real(dp)  :: mu1
     real(dp), intent(in) :: beta, nocc
     integer, intent(in) :: nsteps
-    type(bml_matrix_t) :: B_bml, C_bml, C0_bml
+    type(bml_matrix_t) :: B0_bml, B_bml, C_bml, C0_bml
     character(20) :: bml_type
     real(dp) :: p1_trace, dPdmu_trace, p1B_trace, mu1B, cnst
     integer :: N, M, i, j, k
@@ -446,6 +471,7 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     N = bml_get_N(H0_bml)
     M = bml_get_M(H0_bml)
 
+    call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B0_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, B_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C_bml)
     call bml_zero_matrix(bml_type, bml_element_real, dp, N, M, C0_bml)
@@ -477,11 +503,31 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
       call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
     enddo
 
+!    do i = 1, nsteps-1
+      ! D = A^-1*P0
+!      call bml_multiply(Inv_bml(i), B0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+!      call bml_multiply(C0_bml, B0_bml, B_bml, 1.0_dp, 0.0_dp, threshold)
+      ! B0 = A^-1*P0^2
+!      call bml_copy(B_bml,B0_bml)
+      ! B = I + D -P0*D
+!      call bml_add(B_bml, C0_bml, -1.0_dp, 1.0_dp, threshold)
+!      call bml_scale_add_identity(B_bml, 1.0_dp, 1.0_dp, threshold)
+      ! P1 = 2D*P1(I+D-P0*D)
+!      call bml_multiply(C0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+!      call bml_multiply(C_bml, B_bml, P1_bml, 2.0_dp, 0.0_dp, threshold)
+!    enddo 
+!      call bml_multiply(B0_bml, P1_bml, C_bml, 2.0_dp, 0.0_dp, threshold)
+!      call bml_copy(P1_bml, B_bml)
+!      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+      ! Get next P1
+!      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+!      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+
+
     ! dPdmu = beta*P0(I-P0)
     call bml_copy(P0_bml, B_bml)
     call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
-    call bml_multiply(P0_bml, B_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-    call bml_scale(beta,C_bml)
+    call bml_multiply(P0_bml, B_bml, C_bml, beta, 0.0_dp, threshold)
     dPdmu_trace = bml_trace(C_bml)
     p1_trace = bml_trace(P1_bml)
     mu1 =  - p1_trace/dPdmu_trace
@@ -490,6 +536,7 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
     endif
 
     call bml_deallocate(B_bml)
+    call bml_deallocate(B0_bml)
     call bml_deallocate(C_bml)
     call bml_deallocate(C0_bml)
 
@@ -866,9 +913,7 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
     N = bml_get_N(a_bml)
     err = 100000.0
     i = 0
-    N2 = N*N
-    scaled_tol = tol*N
-    do while(err > scaled_tol)
+    do while(err > tol)
       !write(*,*) 'iter = ', i
       !write(*,*) 'ns error =', err
       call bml_copy(ai_bml, tmp_bml)
@@ -880,7 +925,7 @@ subroutine prg_newtonschulz(a_bml, ai_bml, r_bml, tmp_bml, d_bml, I_bml, tol, th
       !write(*,*) "prev_err = ", prev_err
       if (10*prev_err < err) then
         write(*,*) 'NS did not converge, calling conjugate gradient'
-        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.0001_dp, threshold)
+        call prg_conjgrad(a_bml, ai_bml, I_bml, r_bml, tmp_bml, d_bml, 0.00001_dp, threshold)
       else
         call bml_multiply(tmp_bml, r_bml, ai_bml, 1.0_dp, 1.0_dp, threshold)
       endif
@@ -990,7 +1035,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
 
     do while (r_norm_new .gt. cg_tol)
 
-      write(*,*) r_norm_new
+    !  write(*,*) r_norm_new
       k = k + 1
       if (k .eq. 1) then
         call bml_copy(tmp_bml, d_bml)
@@ -1011,7 +1056,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
         stop
       endif
     enddo
-    write(*,*) "Number of CG-iterations:", k
+    !write(*,*) "Number of CG-iterations:", k
 
   end subroutine prg_conjgrad
 
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
index a19cb5b2..81496915 100644
--- a/src/prg_xlbokernel_mod.F90
+++ b/src/prg_xlbokernel_mod.F90
@@ -83,23 +83,23 @@ end subroutine Invert
   !! \param Nr_elem Number of elements in Hubbard list.
   subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_bml,mu,beta,RXYZ,Box,Hubbard_U,Element_Pointer, &
        Nr_atoms,HDIM,Max_Nr_Neigh,Coulomb_acc,nebcoul,totnebcoul,Hinxlist, &
-       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,X_bml,Y_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
+       S_bml,Z_bml,Nocc,Inv_bml,H1_bml,DO_bml,D1_bml,m_rec,threshold,Nr_elem)
 
     !! Res = q[n] - n
     !! KK0 is preconditioner
     !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
 
     implicit none
-    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
+    integer,    intent(in)      :: Nr_atoms, HDIM, Max_Nr_Neigh,m_rec,Nr_elem,NUMRANK
     real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu
+    real(dp), intent(in)      :: Coulomb_acc, FelTol, beta, mu, Nocc
     real(dp), intent(in)      :: RXYZ(3,Nr_atoms), Box(3,3)
     real(dp), intent(in)      :: Res(Nr_atoms)
     integer,    intent(in)      :: Hinxlist(HDIM),Element_Pointer(Nr_atoms)
     real(dp), intent(in)      :: Hubbard_U(Nr_elem)
     type(bml_matrix_t), intent(inout)  :: HO_bml, S_bml, Z_bml, Inv_bml(m_rec), KK0_bml
     real(dp)                  :: K0Res(Nr_atoms)
-    type(bml_matrix_t),intent(inout) :: H1_bml, X_bml,Y_bml,DO_bml,D1_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml, DO_bml,D1_bml
     real(dp), intent(in)      :: threshold
     integer,    intent(in)      :: LMAX
     character(20)             :: bml_type
@@ -115,17 +115,19 @@ subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_
     real(dp)                  :: vi(Nr_atoms,LMAX), fi(Nr_atoms,LMAX), v(Nr_atoms)
     real(dp)                  :: dr(Nr_Atoms), IdentRes(Nr_atoms)
     real(dp), allocatable     :: O(:,:),M(:,:),row1(:),row2(:),row_NA(:)
-    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml
+    type(bml_matrix_t)          :: KK0T_bml, K0Res_bml, Res_bml, ZT_bml, T_bml, X_bml, Y_bml
 
     call timer_prg_init()
     bml_type = bml_get_type(HO_bml)
     N = bml_get_N(HO_bml)
     MN = bml_get_M(HO_bml)
     allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(row_NA(Nr_atoms))
-    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
-    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
-    call bml_zero_matrix('csr',bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,KK0T_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,K0Res_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,Nr_atoms,Nr_atoms,Res_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,X_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
 
     call bml_transpose(Z_bml,ZT_bml)
 
@@ -137,6 +139,7 @@ subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_
     K0Res = row_NA
     dr = K0Res
 
+    write(*,*) 'resnorm =', norm2(Res), 'kresnorm =', norm2(dr)
     I = 0
     Fel = 1.D0
     do while ((Fel > FelTol).AND.(I < (LMAX)).AND.(I < NUMRANK))   !! Fel = "Error" in Swedish
@@ -249,6 +252,8 @@ subroutine prg_kernel_multirank_latte(KRes,KK0_bml,Res,FelTol,L,LMAX,NUMRANK,HO_
     call bml_deallocate(K0Res_bml)
     call bml_deallocate(Res_bml)
     call bml_deallocate(ZT_bml)
+    call bml_deallocate(X_bml)
+    call bml_deallocate(Y_bml)
     call prg_timer_shutdown()
 
   end subroutine prg_kernel_multirank_latte
@@ -327,8 +332,8 @@ subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX
       enddo
       !$OMP END PARALLEL DO
 
-      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+     ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+     ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       row1 = 0.0_dp
       do J = 1,HDIM
@@ -433,41 +438,43 @@ end subroutine prg_kernel_multirank
   !! \param Nr_elem Number of elements in Hubbard list.
   subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
        Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, &
-       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml,Y_bml,X_bml, &
+       Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml, &
        Nocc,m_rec,threshold,beta,Nr_elem)
 
     use bml
 
     implicit none
     integer, parameter             :: PREC = 8, dp = kind(1.0d0)
-    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Nocc, Max_Nr_Neigh,m_rec
+    integer,    intent(in)         :: Nr_atoms, Nr_elem,HDIM, Max_Nr_Neigh,m_rec
     real(PREC), parameter          :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
     real(PREC), parameter          :: kB = 8.61739d-5 ! eV/K, kB = 6.33366256e-6 Ry/K, kB = 3.166811429e-6 Ha/K
-    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta
+    real(PREC), intent(in)         :: Coulomb_acc, threshold,beta,Nocc
     real(PREC)                     :: v(Nr_atoms)
     real(PREC), intent(in)         :: RXYZ(3,Nr_atoms),Box(3,3)
     integer,    intent(in)         :: Hinxlist(HDIM)
     real(PREC), intent(in)         :: Hubbard_U(Nr_elem)
     type(bml_matrix_t), intent(in)   :: Inv_bml(m_rec)
     type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
-    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml,Y_bml,X_bml
+    type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml
     real(PREC), intent(inout)      :: mu0
     integer,    intent(in)         :: Element_Pointer(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
     real(PREC)                     :: dq_v(Nr_atoms)
-    real(PREC)                     :: dq_dv(Nr_atoms), err,tol
+    real(PREC)                     :: dq_dv(Nr_atoms), err,tol,start,ewaldk,ewaldr,ewaldkacc,ewaldracc,response,respacc
     real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
     integer                        :: I,J,K, ITER, mm,It,N,MN
     real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
-    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml
+    type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml, X_bml, Y_bml
     character(20)                  :: bml_type
 
     bml_type = bml_get_type(HO_bml)
     N = bml_get_N(HO_bml)
     MN = bml_get_M(HO_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,ZT_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,X_bml)
+    call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
     call bml_transpose(Z_bml,ZT_bml)
     allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
     Coulomb_Pot_dq_v = ZERO
@@ -476,7 +483,11 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     JJ = ZERO
     KK = ZERO
 
+    write(*,*) 'Beginning response calculations'
+    ewaldracc = 0.0; ewaldkacc = 0.0; respacc = 0.0;
     do J = 1,Nr_atoms
+      write(*,*) 'J =',J
+      call cpu_time(start)
       dq_v(J) = ONE
       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
       do I = 1,Nr_atoms
@@ -485,9 +496,12 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
       enddo
       !$OMP END PARALLEL DO
-      call Ewald_k_Space_latte(Coulomb_Pot_k,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc, &
-           Max_Nr_Neigh)
+      call cpu_time(ewaldr)
+      ewaldracc = ewaldracc + ewaldr-start
+      call Ewald_k_Space_latte_single(Coulomb_Pot_k,J,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc)
       Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      call cpu_time(ewaldk)
+      ewaldkacc = ewaldkacc + ewaldk-ewaldr
 
       call bml_deallocate(H1_bml)
       call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
@@ -512,7 +526,9 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
       call bml_multiply(X_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
       call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec,mu0,beta,real(nocc,PREC),threshold)
+           m_rec,mu0,beta,Nocc,threshold)
+      call cpu_time(response)
+      respacc = respacc + response-ewaldk
 
       call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)
       call bml_multiply(Y_bml,ZT_bml,D1_bml,1.0_dp,0.0_dp,threshold)
@@ -540,10 +556,15 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
       JJ(I,I) = JJ(I,I) - ONE
     enddo
 
+    write(*,*) 'ewaldracc =', ewaldracc
+    write(*,*) 'ewaldkacc =', ewaldkacc 
+    write(*,*) 'implcit response time =', respacc
     call Invert(JJ,KK,Nr_atoms)
 
     deallocate(row1); deallocate(row2); deallocate(JJ)
     call bml_deallocate(ZT_bml)
+    call bml_deallocate(X_bml)
+    call bml_deallocate(Y_bml)
 
   end subroutine prg_full_kernel_latte
 
@@ -627,8 +648,8 @@ subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
       enddo
       !$OMP END PARALLEL DO
-      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
-           TIMERATIO,Max_Nr_Neigh)
+    !  call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+    !       TIMERATIO,Max_Nr_Neigh)
       Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       diagonal = 0.0_dp
@@ -691,9 +712,9 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
     !! KRes rank-L approximation of (K0*J)^(-1)*K0*(q[n]-n) with (K0*J)^(-1) as in Eq. (41) in Ref. [*]
 
     implicit none
-    integer,    intent(in)      :: Nr_atoms, HDIM, Nocc,Max_Nr_Neigh,m_rec
+    integer,    intent(in)      :: Nr_atoms, HDIM, Max_Nr_Neigh,m_rec
     real(dp), parameter       :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0
-    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu
+    real(dp), intent(in)      :: Coulomb_acc, TIMERATIO, FelTol, beta, mu,Nocc
     real(dp), intent(in)      :: RX(Nr_atoms), RY(Nr_atoms), RZ(Nr_atoms), LBox(3)
     real(dp), intent(in)      :: Res(Nr_atoms)
     integer,    intent(in)      :: H_INDEX_START(Nr_atoms), H_INDEX_END(Nr_atoms)
@@ -745,8 +766,8 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
       enddo
       !$OMP END PARALLEL DO
 
-      call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+     ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+     ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       row1 = 0.0_dp
       do J = 1,HDIM
@@ -771,7 +792,7 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
       call bml_multiply(D1_bml,Z_bml,H1_bml,1.0_dp,0.0_dp,threshold)
 
       call prg_implicit_fermi_first_order_response(HO_bml,H1_bml,DO_bml,D1_bml,Inv_bml, &
-           m_rec, mu, beta, real(nocc,dp), threshold)
+           m_rec, mu, beta, Nocc, threshold)
 
       call bml_transpose(Z_bml,X_bml)
       call bml_multiply(Z_bml,D1_bml,Y_bml,2.0_dp,0.0_dp,threshold)

From 1256e74740bf2b2b9f2953f70c236c9becbc1826 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Fri, 16 Jul 2021 18:23:21 +0200
Subject: [PATCH 08/10] Fixed test

---
 tests/src/main.F90 | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/src/main.F90 b/tests/src/main.F90
index 87285c05..d1df2f04 100644
--- a/tests/src/main.F90
+++ b/tests/src/main.F90
@@ -231,11 +231,10 @@ program main
     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
-    write(*,*) mu
+    
     mu = 0.2_dp  
     call prg_implicit_fermi_save_inverse(inv_bml,ham_bml,rho_bml,norecs,nocc,mu,beta,1e-4_dp, threshold, 1e-5_dp, 1,occiter)
-
+    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
     write(*,*) mu
     call bml_scale(0.5_dp,rho_bml)
     call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)

From 6304e52fbc1f45de355658e384d72acab31c580a Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Sat, 17 Jul 2021 20:22:08 +0200
Subject: [PATCH 09/10] Fixed indentation

---
 src/prg_ewald_mod.F90          | 20 ++++----
 src/prg_implicit_fermi_mod.F90 | 92 +++++++++++++++++-----------------
 src/prg_syrotation_mod.F90     |  4 +-
 src/prg_xlbokernel_mod.F90     | 14 +++---
 tests/src/main.F90             | 14 +++---
 5 files changed, 72 insertions(+), 72 deletions(-)

diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
index 5539c12d..a56de96e 100644
--- a/src/prg_ewald_mod.F90
+++ b/src/prg_ewald_mod.F90
@@ -436,8 +436,8 @@ subroutine  Ewald_Real_Space_Test(COULOMBV,I,RX,RY,RZ,LBox, &
     Ra(2) = RY(I)
     Ra(3) = RZ(I)
 
-!    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(nnI,J,Rb,Rab,dR,MAGR,MAGR2,TJ,DC,Z,NUMREP_ERFC,CA) &
-!    !$OMP REDUCTION(+:COULOMBV)
+    !    !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(nnI,J,Rb,Rab,dR,MAGR,MAGR2,TJ,DC,Z,NUMREP_ERFC,CA) &
+    !    !$OMP REDUCTION(+:COULOMBV)
     do nnI = 1,nrnnlist(I)
       Rb(1) = nnRx(I,nnI)
       Rb(2) = nnRy(I,nnI)
@@ -488,7 +488,7 @@ subroutine  Ewald_Real_Space_Test(COULOMBV,I,RX,RY,RZ,LBox, &
 
       endif
     enddo
-!    !$OMP END PARALLEL DO
+    !    !$OMP END PARALLEL DO
     COULOMBV = KECONST*COULOMBV
 
   end subroutine Ewald_Real_Space_Test
@@ -815,8 +815,8 @@ subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULAC
         M22 = M*RECIPVECS(2,2)
 
         do N = NMIN,NMAX
-          K(1) = L11 
-          K(2) = M22 
+          K(1) = L11
+          K(2) = M22
           K(3) = N*RECIPVECS(3,3)
           K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
 
@@ -824,7 +824,7 @@ subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULAC
           KEPREF = KECONST*PREFACTOR
           JDOT = K(1)*RXYZ(1,J) + K(2)*RXYZ(2,J) + K(3)*RXYZ(3,J)
           SINJDOT = sin(JDOT)
-          COSJDOT = cos(JDOT) 
+          COSJDOT = cos(JDOT)
           do I = 1,Nr_atoms
             IDOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
             COULOMBV(I) = COULOMBV(I) + KEPREF*DELTAQ(J)*(COSJDOT*cos(IDOT)+SINJDOT*sin(IDOT))
@@ -841,9 +841,9 @@ subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULAC
   end subroutine Ewald_k_Space_latte_single
 
   subroutine Ewald_k_Space_Test(COULOMBV,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
-!
+    !
     implicit none
-!
+    !
     integer, parameter        :: PREC = 8
     real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
     real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
@@ -864,7 +864,7 @@ subroutine Ewald_k_Space_Test(COULOMBV,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,Max
     real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
 
     integer                   :: I,J,L,M,N, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
-!
+    !
     COULVOL = LBox(1)*LBox(2)*LBox(3)
     SQRTX = sqrt(-log(COULACC))
 
@@ -946,7 +946,7 @@ subroutine Ewald_k_Space_Test(COULOMBV,RX,RY,RZ,LBox,DELTAQ,Nr_atoms,COULACC,Max
             SINSUM2 = SINSUM*SINSUM
 
             ! Add up energy and force contributions
-!
+            !
             KEPREF = KECONST*PREFACTOR
             !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I)
             do I = 1,Nr_atoms
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 37d03618..20d20f7b 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -100,7 +100,7 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       call prg_conjgrad(y_bml, ai_bml, I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
       do i = 1, nsteps
         call bml_copy(I_bml, Inv_bml(i))
-      enddo 
+      enddo
     else
       ! Otherwise use previous inverse as starting guess
       call bml_copy(Inv_bml(1),ai_bml)
@@ -123,7 +123,7 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find inverse ai = (2*(P2-P)+I)^-1
         !call prg_conjgrad(y_bml, Inv_bml(i), I_bml, aux_bml, d_bml, w_bml, tol, threshold)
-        !call bml_copy(Inv_bml(i),ai_bml) 
+        !call bml_copy(Inv_bml(i),ai_bml)
         call prg_newtonschulz(y_bml, Inv_bml(i), d_bml, w_bml, aux_bml, I_bml, tol, threshold)
         call bml_multiply(Inv_bml(i), p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
         !call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
@@ -133,37 +133,37 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       trdPdmu = beta*(trP0 - bml_sum_squares(p_bml)) ! sum p(i,j)**2
       occErr = abs(trP0 - nocc)
       write(*,*) 'occerr =', occErr
-       
+
       ! If occupation error is too large, do bisection method
       if (occerr > 1.0_dp) then
-     ! if (newerr > occerr) then 
+        ! if (newerr > occerr) then
         if (nocc-trP0 < 0.0_dp .and. prev .eq. -1) then
           prev = -1
-        else if (nocc-trP0 > 0.0_dp .and. prev .eq. 1) then 
+        else if (nocc-trP0 > 0.0_dp .and. prev .eq. 1) then
           prev = 1
         else if (nocc-trP0 > 0.0_dp .and. prev .eq. -1) then
           prev = 1
           alpha = alpha/2
-      !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-      !    do while(newerr > occerr)
-      !      alpha = alpha/2
-      !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-      !    enddo
-        else 
-          prev = -1 
-          alpha = alpha/2 
-      !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-      !    do while(newerr > occerr)
-      !      alpha = alpha/2
-      !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-      !    enddo
+          !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+          !    do while(newerr > occerr)
+          !      alpha = alpha/2
+          !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+          !    enddo
+        else
+          prev = -1
+          alpha = alpha/2
+          !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+          !    do while(newerr > occerr)
+          !      alpha = alpha/2
+          !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
+          !    enddo
         endif
         !newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
         !do while(newerr > occerr .and. abs(occerr-newerr)/occerr<0.1 )
         !  alpha = alpha/2
         !  newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
         !enddo
-!        write(*,*) 'newerr =', newerr
+        !        write(*,*) 'newerr =', newerr
         mu = mu + prev*alpha
         muadj = 1
 
@@ -180,9 +180,9 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       end if
     enddo
 
-    if (iter .ge. maxiter) then 
-            write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
-    end if 
+    if (iter .ge. maxiter) then
+      write(*,*) 'Could not converge chemical potential in prg_impplicit_fermi_save_inverse'
+    end if
     ! Adjusting the occupation sometimes causes the perturbation calculation to not converge.
     ! For now we recompute the DM one extra time if mu was adjusted.
     !if (muadj .eq. 1) then
@@ -317,9 +317,9 @@ subroutine prg_implicit_fermi(h_bml, p_bml, nsteps, k, nocc, &
       trdPdmu = trdPdmu - bml_sum_squares(p_bml) ! sum p(i,j)**2
       trdPdmu = beta * trdPdmu
       occErr = abs(trP0 - nocc)
-      if (occErr .gt. occErrLimit) then 
+      if (occErr .gt. occErrLimit) then
         mu = mu + (nocc - trP0)/trdPdmu
-      end if 
+      end if
       write(*,*) "mu =", mu
     enddo
 
@@ -503,25 +503,25 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
       call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
     enddo
 
-!    do i = 1, nsteps-1
-      ! D = A^-1*P0
-!      call bml_multiply(Inv_bml(i), B0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
-!      call bml_multiply(C0_bml, B0_bml, B_bml, 1.0_dp, 0.0_dp, threshold)
-      ! B0 = A^-1*P0^2
-!      call bml_copy(B_bml,B0_bml)
-      ! B = I + D -P0*D
-!      call bml_add(B_bml, C0_bml, -1.0_dp, 1.0_dp, threshold)
-!      call bml_scale_add_identity(B_bml, 1.0_dp, 1.0_dp, threshold)
-      ! P1 = 2D*P1(I+D-P0*D)
-!      call bml_multiply(C0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
-!      call bml_multiply(C_bml, B_bml, P1_bml, 2.0_dp, 0.0_dp, threshold)
-!    enddo 
-!      call bml_multiply(B0_bml, P1_bml, C_bml, 2.0_dp, 0.0_dp, threshold)
-!      call bml_copy(P1_bml, B_bml)
-!      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
-      ! Get next P1
-!      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
-!      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
+    !    do i = 1, nsteps-1
+    ! D = A^-1*P0
+    !      call bml_multiply(Inv_bml(i), B0_bml, C0_bml, 1.0_dp, 0.0_dp, threshold)
+    !      call bml_multiply(C0_bml, B0_bml, B_bml, 1.0_dp, 0.0_dp, threshold)
+    ! B0 = A^-1*P0^2
+    !      call bml_copy(B_bml,B0_bml)
+    ! B = I + D -P0*D
+    !      call bml_add(B_bml, C0_bml, -1.0_dp, 1.0_dp, threshold)
+    !      call bml_scale_add_identity(B_bml, 1.0_dp, 1.0_dp, threshold)
+    ! P1 = 2D*P1(I+D-P0*D)
+    !      call bml_multiply(C0_bml, P1_bml, C_bml, 1.0_dp, 0.0_dp, threshold)
+    !      call bml_multiply(C_bml, B_bml, P1_bml, 2.0_dp, 0.0_dp, threshold)
+    !    enddo
+    !      call bml_multiply(B0_bml, P1_bml, C_bml, 2.0_dp, 0.0_dp, threshold)
+    !      call bml_copy(P1_bml, B_bml)
+    !      call bml_add(B_bml, C_bml, 2.0_dp, -2.0_dp, threshold)
+    ! Get next P1
+    !      call bml_multiply(B_bml, P0_bml, C_bml, 1.0_dp, 1.0_dp, threshold)
+    !      call bml_multiply(Inv_bml(i), C_bml, P1_bml, 1.0_dp, 0.0_dp, threshold)
 
 
     ! dPdmu = beta*P0(I-P0)
@@ -1035,7 +1035,7 @@ subroutine prg_conjgrad(A_bml, p_bml, p2_bml, tmp_bml, d_bml, w_bml, cg_tol, thr
 
     do while (r_norm_new .gt. cg_tol)
 
-    !  write(*,*) r_norm_new
+      !  write(*,*) r_norm_new
       k = k + 1
       if (k .eq. 1) then
         call bml_copy(tmp_bml, d_bml)
@@ -1173,8 +1173,8 @@ subroutine prg_test_density_matrix(ham_bml, p_bml, beta, mu, nocc, osteps, occEr
       trdPdmu = beta * trdPdmu
       occErr = abs(trP0 - nocc)
       if (occErr .gt. occErrLimit) then
-         mu = mu + (nocc - trP0)/trdPdmu
-      end if 
+        mu = mu + (nocc - trP0)/trdPdmu
+      end if
       !write(*,*) "mu = ", mu
     enddo
 
diff --git a/src/prg_syrotation_mod.F90 b/src/prg_syrotation_mod.F90
index 44d7f91e..16f28b7e 100644
--- a/src/prg_syrotation_mod.F90
+++ b/src/prg_syrotation_mod.F90
@@ -190,7 +190,7 @@ subroutine prg_rotate(rot,r,verbose)
     v2(2)=rot%v2(2)
     v2(3)=rot%v2(3)
 
-    vQ(1)=rot%vQ(1)	      !Rotation center
+    vQ(1)=rot%vQ(1)           !Rotation center
     vQ(2)=rot%vQ(2)
     vQ(3)=rot%vQ(3)
 
@@ -232,7 +232,7 @@ subroutine prg_rotate(rot,r,verbose)
       v2=pq2-vQ
     endif
 
-    vtr(1)=0.0_dp	    !Translation
+    vtr(1)=0.0_dp           !Translation
     vtr(2)=0.0_dp
     vtr(3)=0.0_dp
 
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
index 81496915..8b36f9fb 100644
--- a/src/prg_xlbokernel_mod.F90
+++ b/src/prg_xlbokernel_mod.F90
@@ -332,8 +332,8 @@ subroutine prg_kernel_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,beta,RX
       enddo
       !$OMP END PARALLEL DO
 
-     ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-     ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       row1 = 0.0_dp
       do J = 1,HDIM
@@ -557,7 +557,7 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     enddo
 
     write(*,*) 'ewaldracc =', ewaldracc
-    write(*,*) 'ewaldkacc =', ewaldkacc 
+    write(*,*) 'ewaldkacc =', ewaldkacc
     write(*,*) 'implcit response time =', respacc
     call Invert(JJ,KK,Nr_atoms)
 
@@ -648,8 +648,8 @@ subroutine prg_full_kernel(KK,DO_bml,mu0,RX,RY,RZ,LBox,Hubbard_U,Element_Type, &
         Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
       enddo
       !$OMP END PARALLEL DO
-    !  call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
-    !       TIMERATIO,Max_Nr_Neigh)
+      !  call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc, &
+      !       TIMERATIO,Max_Nr_Neigh)
       Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       diagonal = 0.0_dp
@@ -766,8 +766,8 @@ subroutine prg_kernel_matrix_multirank(KRes,KK0_bml,Res,FelTol,L,LMAX,HO_bml,mu,
       enddo
       !$OMP END PARALLEL DO
 
-     ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
-     ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      ! call Ewald_k_Space(Coulomb_Pot_k,Coulomb_Force_k,RX,RY,RZ,LBox,dq_v,Nr_atoms,Coulomb_acc,TIMERATIO,Max_Nr_Neigh)
+      ! Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
 
       row1 = 0.0_dp
       do J = 1,HDIM
diff --git a/tests/src/main.F90 b/tests/src/main.F90
index 105fa49a..9e437799 100644
--- a/tests/src/main.F90
+++ b/tests/src/main.F90
@@ -32,7 +32,7 @@ program main
   implicit none
 
   integer :: norb, mdim, verbose
-  type(bml_matrix_t) :: inv_bml(10)  
+  type(bml_matrix_t) :: inv_bml(10)
   type(bml_matrix_t) :: ham_bml
   type(bml_matrix_t) :: rho_bml, rho1_bml
   type(bml_matrix_t) :: rho_ortho_bml
@@ -215,7 +215,7 @@ program main
       error stop
     endif
 
-  case("prg_implicit_fermi_save_inverse") 
+  case("prg_implicit_fermi_save_inverse")
 
     mu = 0.2_dp
     beta = 4.0_dp !nocc,osteps,occerrlimit
@@ -224,7 +224,7 @@ program main
 
     do i = 1,norecs
       call bml_identity_matrix(bml_type,bml_element_real,dp,norb,norb,inv_bml(i))
-    enddo 
+    enddo
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho_bml)
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,ham_bml)
@@ -232,10 +232,10 @@ program main
     call bml_read_matrix(ham_bml,'hamiltonian_ortho.mtx')
 
     call bml_zero_matrix(bml_type,bml_element_real,dp,norb,norb,rho1_bml)
-    
-    mu = 0.2_dp  
+
+    mu = 0.2_dp
     call prg_implicit_fermi_save_inverse(inv_bml,ham_bml,rho_bml,norecs,nocc,mu,beta,1e-4_dp, threshold, 1e-5_dp, 1,occiter)
-    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)   
+    call prg_test_density_matrix(ham_bml,rho1_bml,beta,mu,nocc,1,1e-4_dp,threshold)
     write(*,*) mu
     call bml_scale(0.5_dp,rho_bml)
     call bml_add(rho1_bml,rho_bml,1.0_dp,-1.0_dp,threshold)
@@ -1132,7 +1132,7 @@ program main
     call bml_add_deprecated(-1.0_dp,rho_bml,1.0_dp,rho1_bml,0.0_dp)
     error_calc = bml_fnorm(rho_bml)
     write(*,*)error_calc
-    
+
 
     if(error_calc.gt.error_tol)then
       write(*,*) "Error is too high", error_calc

From faee51d0c03c4abdc3ae1584356a99f2020fbdd1 Mon Sep 17 00:00:00 2001
From: Linnea Andersson <lian5777@fredholm.it.uu.se>
Date: Fri, 17 Sep 2021 18:44:46 +0200
Subject: [PATCH 10/10] changes

---
 src/prg_ewald_mod.F90          | 255 ++++++++++++++++++++++++++++++++-
 src/prg_implicit_fermi_mod.F90 |  48 ++-----
 src/prg_xlbokernel_mod.F90     |  24 ++--
 3 files changed, 284 insertions(+), 43 deletions(-)

diff --git a/src/prg_ewald_mod.F90 b/src/prg_ewald_mod.F90
index a56de96e..5dbb24ea 100644
--- a/src/prg_ewald_mod.F90
+++ b/src/prg_ewald_mod.F90
@@ -16,9 +16,11 @@ module prg_ewald_mod
   public :: Ewald_Real_Space
   public :: Ewald_Real_Space_latte
   public :: Ewald_Real_Space_Test
+  public :: Ewald_Real_Space_Matrix_latte
   public :: Ewald_k_space_latte_single
   public :: Ewald_k_space_latte
   public :: Ewald_k_space_Test
+  public :: Ewald_k_space_Matrix_latte
 
 contains
 
@@ -259,6 +261,129 @@ subroutine  Ewald_Real_Space_Single(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
 
   end subroutine Ewald_Real_Space_Single
 
+  subroutine  Ewald_Real_Space_Matrix_latte(E,RXYZ,Box,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+
+    implicit none
+
+    integer,    parameter     :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, HDIM, Max_Nr_Neigh, Nr_Elem
+    real(PREC), intent(in)    :: COULACC
+    real(PREC), intent(out)   :: E(Nr_atoms,Nr_atoms)
+    real(PREC)                :: TFACT, RELPERM, KECONST
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC), intent(in)    :: U(Nr_elem)
+    real(PREC)                :: COULCUT, COULCUT2
+    integer, intent(in)       :: Element_Pointer(Nr_atoms)
+    integer,    intent(in)    :: totnebcoul(Nr_atoms), nebcoul(4,Max_Nr_Neigh,Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: NUMREP_ERFC, CA, FORCE, EXPTI, EXPTJ
+    real(PREC)                :: TJ,TJ2,TJ3,TJ4,TJ6,TI2MTJ2A,SA,SB,SC,SD,SE,SF
+    real(PREC)                :: TI2MTJ2, TI2MTI2, TJ2MTI2
+    integer                   :: I,J,K, ccnt, newj, PBCI,PBCJ,PBCK
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+    COULCUT = 12.D0
+    CALPHA = SQRTX/COULCUT
+    COULCUT2 = COULCUT*COULCUT
+    CALPHA2 = CALPHA*CALPHA
+
+    RELPERM = ONE
+    KECONST = 14.3996437701414D0*RELPERM
+    TFACT  = 16.0D0/(5.0D0*KECONST)
+
+    E = 0.0
+
+    !$OMP PARALLEL DO DEFAULT(PRIVATE) SHARED(E,U,Element_Pointer,RXYZ,totnebcoul,nebcoul,coulcut,calpha)
+       
+    do I = 1,Nr_atoms
+
+    TI = TFACT*U(Element_Pointer(I))
+    TI2 = TI*TI
+    TI3 = TI2*TI
+    TI4 = TI2*TI2
+    TI6 = TI4*TI2
+
+    SSA = TI
+    SSB = TI3/48.D0
+    SSC = 3.D0*TI2/16.D0
+    SSD = 11.D0*TI/16.D0
+    SSE = 1.D0
+
+    Ra(1) = RXYZ(1,I)
+    Ra(2) = RXYZ(2,I)
+    Ra(3) = RXYZ(3,I)
+
+    do newj = 1,totnebcoul(I)
+      J = NEBCOUL(1, NEWJ, I)
+      PBCI = NEBCOUL(2, NEWJ, I)
+      PBCJ = NEBCOUL(3, NEWJ, I)
+      PBCK = NEBCOUL(4, NEWJ, I)
+      Rb(1) = RXYZ(1,J) + REAL(PBCI)*BOX(1,1) + REAL(PBCJ)*BOX(2,1) + &
+           REAL(PBCK)*BOX(3,1)
+
+      Rb(2) = RXYZ(2,J) + REAL(PBCI)*BOX(1,2) + REAL(PBCJ)*BOX(2,2) + &
+           REAL(PBCK)*BOX(3,2)
+
+      Rb(3) = RXYZ(3,J) + REAL(PBCI)*BOX(1,3) + REAL(PBCJ)*BOX(2,3) + &
+           REAL(PBCK)*BOX(3,3)
+      Rab = Rb-Ra  ! OBS b - a !!!
+      dR = norm2(Rab)
+      MAGR = dR
+      MAGR2 = dR*dR
+
+      if ((dR <= COULCUT).and.(dR > 1e-12)) then
+
+        TJ = TFACT*U(Element_Pointer(J))
+        DC = Rab/dR
+
+        Z = abs(CALPHA*MAGR)
+        NUMREP_ERFC = erfc(Z)
+
+        CA = NUMREP_ERFC/MAGR
+        E(I,J) = E(I,J) + CA
+        ccnt = ccnt + 1
+        CA = CA + TWO*CALPHA*exp( -CALPHA2*MAGR2 )/SQRTPI
+        EXPTI = exp(-TI*MAGR )
+
+        if (Element_Pointer(I).eq.Element_Pointer(J)) then
+          E(I,J) = E(I,J) - EXPTI*(SSB*MAGR2 + SSC*MAGR + SSD + SSE/MAGR)
+          ccnt = ccnt + 1
+        else
+          TJ2 = TJ*TJ
+          TJ3 = TJ2*TJ
+          TJ4 = TJ2*TJ2
+          TJ6 = TJ4*TJ2
+          EXPTJ = exp( -TJ*MAGR )
+          TI2MTJ2 = TI2 - TJ2
+          TJ2MTI2 = -TI2MTJ2
+          SA = TI
+          SB = TJ4*TI/(TWO*TI2MTJ2*TI2MTJ2)
+          SC = (TJ6 - THREE*TJ4*TI2)/(TI2MTJ2*TI2MTJ2*TI2MTJ2)
+          SD = TJ
+          SE = TI4*TJ/(TWO * TJ2MTI2 * TJ2MTI2)
+          SF = (TI6 - THREE*TI4*TJ2)/(TJ2MTI2*TJ2MTI2*TJ2MTI2)
+
+          E(I,J) = E(I,J) - ((EXPTI*(SB - (SC/MAGR)) + EXPTJ*(SE - (SF/MAGR))))
+        endif
+
+      endif
+    enddo
+    enddo
+    !$OMP END PARALLEL DO
+
+    E = KECONST*E
+
+  end subroutine Ewald_Real_Space_Matrix_latte
+
   subroutine  Ewald_Real_Space_latte(COULOMBV,I,RXYZ,Box, &
        DELTAQ,U,Element_Pointer,Nr_atoms,COULACC,nebcoul,totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
 
@@ -615,7 +740,7 @@ subroutine  Ewald_Real_Space(COULOMBV,FCOUL,I,RX,RY,RZ,LBox, &
     COULOMBV = KECONST*COULOMBV
 
   end subroutine Ewald_Real_Space
-
+  
   subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_Neigh)
 
     implicit none
@@ -742,6 +867,134 @@ subroutine Ewald_k_Space_latte(COULOMBV,RXYZ,Box,DELTAQ,Nr_atoms,COULACC,Max_Nr_
 
   end subroutine Ewald_k_Space_latte
 
+  subroutine Ewald_k_Space_Matrix_latte(E,RXYZ,Box,Nr_atoms,COULACC,Max_Nr_Neigh,nebcoul,totnebcoul)
+
+    implicit none
+
+    integer, parameter        :: PREC = 8
+    real(PREC), parameter     :: ONE = 1.D0, TWO = 2.D0, ZERO = 0.D0, SIX = 6.D0, THREE = 3.D0, FOUR = 4.D0
+    real(PREC), parameter     :: FOURTYEIGHT = 48.D0, ELEVEN = 11.D0, SIXTEEN = 16.D0, EIGHT = 8.D0
+    real(PREC), parameter     :: pi = 3.14159265358979323846264D0
+    real(PREC), parameter     :: SQRTPI = 1.772453850905516D0
+    integer,    intent(in)    :: Nr_atoms, Max_Nr_Neigh,nebcoul(4,Max_Nr_Neigh,Nr_atoms),totnebcoul(Nr_atoms)
+    real(PREC), intent(in)    :: COULACC
+    real(PREC)                :: KECONST, TFACT, RELPERM
+    real(PREC), intent(in)    :: RXYZ(3,Nr_atoms), Box(3,3)
+    real(PREC)                :: COULCUT, COULCUT2
+    real(PREC), intent(out)   :: E(Nr_atoms,Nr_atoms)
+    real(PREC)                :: Ra(3), Rb(3), dR, Rab(3)
+    real(PREC)                :: COULVOL, SQRTX, CALPHA, DC(3), MAGR, MAGR2, Z
+    real(PREC)                :: CALPHA2, TI,TI2,TI3,TI4,TI6,SSA,SSB,SSC,SSD,SSE
+    real(PREC)                :: CORRFACT,FOURCALPHA2, FORCE
+    real(PREC)                :: RECIPVECS(3,3),SINLIST(Nr_atoms),COSLIST(Nr_Atoms)
+    real(PREC)                :: K(3),L11,L12,L13,M21,M22,M23,K2,KCUTOFF,KCUTOFF2,PREFACTOR
+    real(PREC)                :: PREVIR, COSSUM,SINSUM,DOT, KEPREF, COSSUM2, SINSUM2
+
+    integer                   :: I,J,L,M,N, newj, ccnt, nnI, LMAX,MMAX,NMAX,NMIN,MMIN
+
+    COULVOL = Box(1,1)*Box(2,2)*Box(3,3)
+    SQRTX = sqrt(-log(COULACC))
+
+    ccnt = 0
+
+    COULCUT = 12.0D0
+    CALPHA = SQRTX/COULCUT
+
+    COULCUT2 = COULCUT*COULCUT
+    KCUTOFF = TWO*CALPHA*SQRTX
+    KCUTOFF2 = KCUTOFF*KCUTOFF
+    CALPHA2 = CALPHA*CALPHA
+    FOURCALPHA2 = FOUR*CALPHA2
+
+    RECIPVECS = ZERO
+    RECIPVECS(1,1) = TWO*pi/Box(1,1)
+    RECIPVECS(2,2) = TWO*pi/Box(2,2)
+    RECIPVECS(3,3) = TWO*pi/Box(3,3)
+    LMAX = floor(KCUTOFF / sqrt(RECIPVECS(1,1)*RECIPVECS(1,1)))
+    MMAX = floor(KCUTOFF / sqrt(RECIPVECS(2,2)*RECIPVECS(2,2)))
+    NMAX = floor(KCUTOFF / sqrt(RECIPVECS(3,3)*RECIPVECS(3,3)))
+
+    RELPERM = 1.D0
+    KECONST = 14.3996437701414D0*RELPERM
+
+    !COULOMBV = ZERO
+    SINLIST = ZERO
+    COSLIST = ZERO
+
+    E = 0.0
+
+    do L = 0,LMAX
+
+      if (L.eq.0) then
+        MMIN = 0
+      else
+        MMIN = -MMAX
+      endif
+
+      L11 = L*RECIPVECS(1,1)
+      L12 = L*RECIPVECS(1,2)
+      L13 = L*RECIPVECS(1,3)
+
+      do M = MMIN,MMAX
+
+        NMIN = -NMAX
+        if ((L==0).and.(M==0)) then
+          NMIN = 1
+        endif
+
+        M21 = L11 + M*RECIPVECS(2,1)
+        M22 = L12 + M*RECIPVECS(2,2)
+        M23 = L13 + M*RECIPVECS(2,3)
+
+        do N = NMIN,NMAX
+          K(1) = M21 + N*RECIPVECS(3,1)
+          K(2) = M22 + N*RECIPVECS(3,2)
+          K(3) = M23 + N*RECIPVECS(3,3)
+          K2 = K(1)*K(1) + K(2)*K(2) + K(3)*K(3)
+          if (K2.le.KCUTOFF2) then
+            PREFACTOR = EIGHT*pi*exp(-K2/(4.D0*CALPHA2))/(COULVOL*K2)
+            PREVIR = (2.D0/K2) + (2.D0/(4.D0*CALPHA2));
+
+
+            ! Doing the sin and cos sums
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,DOT) 
+            do I = 1,Nr_atoms
+              DOT = K(1)*RXYZ(1,I) + K(2)*RXYZ(2,I) + K(3)*RXYZ(3,I)
+              ! We re-use these in the next loop...
+              SINLIST(I) = sin(DOT)
+              COSLIST(I) = cos(DOT)
+           !   COSSUM = COSSUM + DELTAQ(I)*COSLIST(I)
+           !   SINSUM = SINSUM + DELTAQ(I)*SINLIST(I)
+            enddo
+            !$OMP END PARALLEL DO
+
+            ! Add up energy and force contributions
+
+            KEPREF = KECONST*PREFACTOR
+            CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+            !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
+            do I = 1,Nr_atoms
+               do newj = 1,totnebcoul(I)
+                 J = NEBCOUL(1, NEWJ, I)
+                 E(I,J) = E(I,J) + KEPREF*(COSLIST(I)*COSLIST(J)+SINLIST(I)*SINLIST(J))
+          !    COULOMBV(I) = COULOMBV(I) + KEPREF*(COSLIST(I)*COSSUM + SINLIST(I)*SINSUM)
+               enddo
+               E(I,I) = E(I,I) - CORRFACT
+            enddo
+            !$OMP END PARALLEL DO
+
+            !KEPREF = KEPREF*(COSSUM2 + SINSUM2)
+          endif
+        enddo
+      enddo
+    enddo
+
+    ! Point self energy
+    !CORRFACT = 2.D0*KECONST*CALPHA/SQRTPI;
+    !COULOMBV = COULOMBV - CORRFACT*DELTAQ;
+
+  end subroutine Ewald_k_Space_Matrix_latte
+
   subroutine Ewald_k_Space_latte_single(COULOMBV,J,RXYZ,Box,DELTAQ,Nr_atoms,COULACC)
 
     implicit none
diff --git a/src/prg_implicit_fermi_mod.F90 b/src/prg_implicit_fermi_mod.F90
index 20d20f7b..633ea1c8 100644
--- a/src/prg_implicit_fermi_mod.F90
+++ b/src/prg_implicit_fermi_mod.F90
@@ -80,13 +80,14 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
     occErr = 10.0_dp
     newerr = 1000_dp
     preverr = 1000_dp
-    alpha = 8.0_dp
-    prev = 0
+    alpha = 1.0_dp
+    prev = 1
     iter = 0
     maxiter = 30
     cnst = beta/(1.0_dp*2**(nsteps+2))
 
     if (SCF_IT .eq. 1) then
+      alpha = 4.0_dp
       ! Normalization
       ! P0 = 0.5*I - cnst*(H0-mu0*I)
       call bml_copy(h_bml, p_bml)
@@ -101,9 +102,6 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
       do i = 1, nsteps
         call bml_copy(I_bml, Inv_bml(i))
       enddo
-    else
-      ! Otherwise use previous inverse as starting guess
-      call bml_copy(Inv_bml(1),ai_bml)
     end if
 
     do while ((occErr .gt. occErrLimit .or. muadj .eq. 1) .and. iter < maxiter)
@@ -122,8 +120,11 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
         call bml_add(y_bml, p_bml, 1.0_dp, -1.0_dp, threshold)
         call bml_scale_add_identity(y_bml, 2.0_dp, 1.0_dp, threshold)
         ! Find inverse ai = (2*(P2-P)+I)^-1
-        !call prg_conjgrad(y_bml, Inv_bml(i), I_bml, aux_bml, d_bml, w_bml, tol, threshold)
+        !call prg_conjgrad(y_bml, Inv_bml(i), I_bml, w_bml, d_bml, aux_bml, tol, threshold)
         !call bml_copy(Inv_bml(i),ai_bml)
+        if (iter .eq. 1) then
+          call prg_conjgrad(y_bml, Inv_bml(i), I_bml, aux_bml, d_bml, w_bml, 0.0001_dp, threshold)
+        endif
         call prg_newtonschulz(y_bml, Inv_bml(i), d_bml, w_bml, aux_bml, I_bml, tol, threshold)
         call bml_multiply(Inv_bml(i), p2_bml, p_bml, 1.0_dp, 0.0_dp, threshold)
         !call bml_copy(ai_bml, Inv_bml(i)) ! Save inverses for use in perturbation response calculation
@@ -136,7 +137,6 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
 
       ! If occupation error is too large, do bisection method
       if (occerr > 1.0_dp) then
-        ! if (newerr > occerr) then
         if (nocc-trP0 < 0.0_dp .and. prev .eq. -1) then
           prev = -1
         else if (nocc-trP0 > 0.0_dp .and. prev .eq. 1) then
@@ -144,36 +144,16 @@ subroutine prg_implicit_fermi_save_inverse(Inv_bml, h_bml, p_bml, nsteps, nocc,
         else if (nocc-trP0 > 0.0_dp .and. prev .eq. -1) then
           prev = 1
           alpha = alpha/2
-          !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-          !    do while(newerr > occerr)
-          !      alpha = alpha/2
-          !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-          !    enddo
         else
           prev = -1
           alpha = alpha/2
-          !    newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-          !    do while(newerr > occerr)
-          !      alpha = alpha/2
-          !      newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-          !    enddo
         endif
-        !newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-        !do while(newerr > occerr .and. abs(occerr-newerr)/occerr<0.1 )
-        !  alpha = alpha/2
-        !  newerr = abs(trP0+prev*alpha*trdPdmu-nocc)
-        !enddo
-        !        write(*,*) 'newerr =', newerr
         mu = mu + prev*alpha
         muadj = 1
 
         ! Otherwise do Newton
       else if (occErr .gt. occErrLimit) then
         mustep = (nocc -trP0)/trdPdmu
-        !if (abs(mustep) > 1.0 .or. preverr < occErr) then
-        !  alpha = alpha/2
-        !  mustep = alpha*mustep
-        !end if
         mu = mu + mustep
         muadj = 1
         preverr = occErr
@@ -525,14 +505,14 @@ subroutine prg_implicit_fermi_first_order_response(H0_bml, H1_bml, P0_bml, P1_bm
 
 
     ! dPdmu = beta*P0(I-P0)
-    call bml_copy(P0_bml, B_bml)
-    call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
-    call bml_multiply(P0_bml, B_bml, C_bml, beta, 0.0_dp, threshold)
-    dPdmu_trace = bml_trace(C_bml)
-    p1_trace = bml_trace(P1_bml)
-    mu1 =  - p1_trace/dPdmu_trace
+    !call bml_copy(P0_bml, B_bml)
+    !call bml_scale_add_identity(B_bml, -1.0_dp, 1.0_dp, threshold)
+    !call bml_multiply(P0_bml, B_bml, C_bml, beta, 0.0_dp, threshold)
+    !dPdmu_trace = bml_trace(C_bml)
+    !p1_trace = bml_trace(P1_bml)
+    !mu1 =  - p1_trace/dPdmu_trace
     if (abs(dPdmu_trace) > 1e-8) then
-      call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
+      !call bml_add(P1_bml,C_bml,1.0_dp,mu1,threshold)
     endif
 
     call bml_deallocate(B_bml)
diff --git a/src/prg_xlbokernel_mod.F90 b/src/prg_xlbokernel_mod.F90
index 8b36f9fb..3a25c8ce 100644
--- a/src/prg_xlbokernel_mod.F90
+++ b/src/prg_xlbokernel_mod.F90
@@ -439,7 +439,7 @@ end subroutine prg_kernel_multirank
   subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
        Element_Pointer, Nr_atoms,HDIM, Max_Nr_Neigh,Coulomb_acc, &
        Hinxlist,S_bml,Z_bml,Inv_bml,D1_bml,H1_bml,HO_bml, &
-       Nocc,m_rec,threshold,beta,Nr_elem)
+       Nocc,m_rec,threshold,beta,Nr_elem,nebcoul,totnebcoul)
 
     use bml
 
@@ -457,7 +457,7 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     type(bml_matrix_t), intent(in)   :: S_bml,Z_bml,HO_bml
     type(bml_matrix_t),intent(inout) :: H1_bml,DO_bml,D1_bml
     real(PREC), intent(inout)      :: mu0
-    integer,    intent(in)         :: Element_Pointer(Nr_atoms)
+    integer,    intent(in)         :: Element_Pointer(Nr_atoms),nebcoul(4,Max_Nr_Neigh,Nr_atoms),totnebcoul(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real(Nr_atoms), Coulomb_Pot(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real_I, Coulomb_Pot_k(Nr_atoms)
     real(PREC)                     :: Coulomb_Pot_Real_dq_v(Nr_atoms), Coulomb_Pot_dq_v(Nr_atoms)
@@ -465,7 +465,7 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     real(PREC)                     :: dq_dv(Nr_atoms), err,tol,start,ewaldk,ewaldr,ewaldkacc,ewaldracc,response,respacc
     real(PREC), intent(out)        :: KK(Nr_atoms,Nr_atoms)
     integer                        :: I,J,K, ITER, mm,It,N,MN
-    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:)
+    real(PREC),allocatable         :: row1(:),row2(:),JJ(:,:),E(:,:),EReal(:,:),EKspace(:,:)
     type(bml_matrix_t)             :: ZT_bml, R_bml, JJI_bml, JJ_bml, tmp_bml, X_bml, Y_bml
     character(20)                  :: bml_type
 
@@ -477,12 +477,18 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,Y_bml)
     call bml_transpose(Z_bml,ZT_bml)
     allocate(row1(HDIM)); allocate(row2(HDIM)); allocate(JJ(Nr_atoms,Nr_atoms))
+    allocate(EReal(Nr_atoms,Nr_atoms)); allocate(EKspace(Nr_atoms,Nr_atoms))
+    allocate(E(Nr_atoms,Nr_atoms))
     Coulomb_Pot_dq_v = ZERO
     Coulomb_Pot_k = ZERO
     dq_v = ZERO
     JJ = ZERO
     KK = ZERO
 
+    call Ewald_Real_Space_Matrix_latte(EReal,RXYZ,Box,Hubbard_U,Element_Pointer, & 
+            Nr_atoms,Coulomb_acc,Nebcoul,Totnebcoul,HDIM,Max_Nr_Neigh,Nr_Elem)
+    call Ewald_k_Space_Matrix_latte(EKspace,RXYZ,Box,Nr_atoms,Coulomb_acc,Max_Nr_Neigh,Nebcoul,Totnebcoul)
+    E = EReal + EKspace
     write(*,*) 'Beginning response calculations'
     ewaldracc = 0.0; ewaldkacc = 0.0; respacc = 0.0;
     do J = 1,Nr_atoms
@@ -491,17 +497,18 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
       dq_v(J) = ONE
       !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,Coulomb_Pot_Real_I)
       do I = 1,Nr_atoms
-        call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
-             dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
-        Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
+        !call Ewald_Real_Space_Single_latte(Coulomb_Pot_Real_I,I,RXYZ,Box,Nr_elem, &
+        !     dq_v,J,Hubbard_U,Element_Pointer,Nr_atoms,Coulomb_acc,HDIM,Max_Nr_Neigh)
+        !Coulomb_Pot_Real(I) = Coulomb_Pot_Real_I
       enddo
       !$OMP END PARALLEL DO
       call cpu_time(ewaldr)
       ewaldracc = ewaldracc + ewaldr-start
-      call Ewald_k_Space_latte_single(Coulomb_Pot_k,J,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc)
-      Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
+      !call Ewald_k_Space_latte_single(Coulomb_Pot_k,J,RXYZ,Box,dq_v,Nr_atoms,Coulomb_acc)
+      !Coulomb_Pot_dq_v = Coulomb_Pot_Real+Coulomb_Pot_k
       call cpu_time(ewaldk)
       ewaldkacc = ewaldkacc + ewaldk-ewaldr
+      Coulomb_Pot_dq_v = matmul(E,dq_v)
 
       call bml_deallocate(H1_bml)
       call bml_zero_matrix(bml_type,bml_element_real,dp,N,MN,H1_bml)
@@ -562,6 +569,7 @@ subroutine prg_full_kernel_latte(KK,DO_bml,mu0,RXYZ,Box,Hubbard_U, &
     call Invert(JJ,KK,Nr_atoms)
 
     deallocate(row1); deallocate(row2); deallocate(JJ)
+    deallocate(E); deallocate(EReal); deallocate(EKspace)
     call bml_deallocate(ZT_bml)
     call bml_deallocate(X_bml)
     call bml_deallocate(Y_bml)