TABLE OF CONTENTS


ABINIT/cwavef_double_rfft_trick_pack [ Functions ]

[ Top ] [ Functions ]

NAME

 cwavef_double_rfft_trick_pack

FUNCTION

 We have C(G)=C(-G) and D(G)=D(-G) and only G components are in memory (istwfk=2)
 The Fourier transform is:
 C(r) = sum_G e^(iGr) C(G) = sum_(G_z>=0,G/=0) 2 Re[e^(iGr) C(G)] + C(0)
 so C(r) is a real function (same for D)
 Here we construct:
 E( G) = C(G)   + i D(G)
 E(-G) = C(G)^* + i D(G)^* (G/=0)
 so:
 E(r) = C(r) + i D(r)
 In short, one can do only one FFT on E (with istwfk=1) and obtains the FFT of C and D (istwfk=2)

INPUTS

 cwavef(2,npw_k*my_nspinor*(2*ndat))=planewave coefficients of wavefunctioni (istwfk=2)
 mpi_enreg=information about MPI parallelization
 ndat=number of FFTs to perform in parall
 npw_k=number of planewaves in basis for given k point (istwfk=2).

OUTPUT

 cwavef_fft(2,npw_fft*my_nspinor*ndat)=planewave coefficients of wavefunction (istwfk=1)

SIDE EFFECTS

NOTES

PARENTS

CHILDREN

SOURCE

1083 subroutine cwavef_double_rfft_trick_pack(cwavef,cwavef_fft,mpi_enreg,ndat,npw_k)
1084 
1085 !Arguments ------------------------------------
1086 !scalars
1087  integer,intent(in) :: ndat,npw_k
1088  type(MPI_type),intent(in) :: mpi_enreg
1089 !arrays
1090  real(dp),intent(in) :: cwavef(:,:)
1091  real(dp),intent(out) :: cwavef_fft(:,:)
1092 !Local variables-------------------------------
1093 !scalars
1094  integer :: idat,i0,ib1,ib2,npw_fft
1095 
1096  npw_fft=2*npw_k
1097  i0=1
1098  if (mpi_enreg%me_g0_fft==1) then! Do not include G=(0,0,0) twice
1099    npw_fft=npw_fft-1
1100    i0=2
1101  end if
1102 
1103  if (mpi_enreg%nproc_fft==1) then
1104    if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*2*ndat) then
1105      ABI_BUG('wrong size for cwavef')
1106    end if
1107  else
1108    if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*ndat) then
1109      ABI_BUG('wrong size for cwavef')
1110    end if
1111  end if
1112  if (size(cwavef_fft,1)/=2.or.size(cwavef_fft,2)/=npw_fft*ndat) then
1113    ABI_BUG('wrong size for cwavef_fft')
1114  end if
1115 
1116  if (mpi_enreg%nproc_fft==1) then
1117    do idat=1,ndat
1118      ib1=(idat-1)*npw_fft  ! band shift for cwavef_fft
1119      ib2=(idat-1)*2*npw_k ! band shift for cwavef
1120      ! E(G) = C(G) + i D(G)
1121      cwavef_fft(1,1+ib1:npw_k+ib1) = cwavef(1,1+ib2      :  npw_k+ib2) &
1122                                     -cwavef(2,1+npw_k+ib2:2*npw_k+ib2)
1123      cwavef_fft(2,1+ib1:npw_k+ib1) = cwavef(2,1+ib2      :  npw_k+ib2) &
1124                                     +cwavef(1,1+npw_k+ib2:2*npw_k+ib2)
1125      ! E(-G) = C(G)^* + i D(G)^* (G/=0)
1126      cwavef_fft(1,1+npw_k+ib1:npw_fft+ib1) = cwavef(1,i0      +ib2:  npw_k+ib2) &
1127                                             +cwavef(2,i0+npw_k+ib2:2*npw_k+ib2)
1128      cwavef_fft(2,1+npw_k+ib1:npw_fft+ib1) =-cwavef(2,i0      +ib2:  npw_k+ib2) &
1129                                             +cwavef(1,i0+npw_k+ib2:2*npw_k+ib2)
1130    end do
1131  else
1132    do idat=1,ndat
1133      ib1=(idat-1)*npw_fft  ! band shift for cwavef_fft
1134      ib2=(idat-1)*npw_k ! band shift for cwavef
1135      ! E(G) = C(G)
1136      cwavef_fft(:,1+ib1:npw_k+ib1) = cwavef(:,1+ib2      :  npw_k+ib2)
1137      ! E(-G) = C(G)^*
1138      cwavef_fft(1,1+npw_k+ib1:npw_fft+ib1) =  cwavef(1,i0+ib2:npw_k+ib2)
1139      cwavef_fft(2,1+npw_k+ib1:npw_fft+ib1) = -cwavef(2,i0+ib2:npw_k+ib2)
1140    end do
1141  end if
1142 
1143 end subroutine cwavef_double_rfft_trick_pack

ABINIT/cwavef_double_rfft_trick_unpack [ Functions ]

[ Top ] [ Functions ]

NAME

 cwavef_double_rfft_trick_unpack

FUNCTION

 From the "cwavef_double_rfft_trick_pack" routine we have:
 E( G) = C(G)   + i D(G)
 E(-G) = C(G)^* + i D(G)^* (G/=0)
 Here we compute:
 C(G) = ( E(G) + E(-G)^* ) / 2   (G/=0)
 D(G) = ( iE(-G)^* - iE(G)) / 2  (G/=0)
 and:
 C(0) = Re(E(0))
 D(0) = Im(E(0))

INPUTS

 cwavef_fft(2,npw_fft*my_nspinor*ndat_)=planewave coefficients of wavefunction (istwfk=1)
 mpi_enreg=information about MPI parallelization
 ndat=number of FFTs to perform in parall
 npw_k=number of planewaves in basis for given k point (istwfk=2).

OUTPUT

 cwavef(2,npw_fft*my_nspinor*(2*ndat))=planewave coefficients of wavefunction (istwfk=2)

SIDE EFFECTS

NOTES

PARENTS

CHILDREN

SOURCE

1182 subroutine cwavef_double_rfft_trick_unpack(cwavef,cwavef_fft,mpi_enreg,ndat,npw_k)
1183 
1184 !Arguments ------------------------------------
1185 !scalars
1186  integer,intent(in) :: ndat,npw_k
1187  type(MPI_type),intent(in) :: mpi_enreg
1188 !arrays
1189  real(dp),intent(out) :: cwavef(:,:)
1190  real(dp),intent(in) :: cwavef_fft(:,:)
1191 !Local variables-------------------------------
1192 !scalars
1193  integer :: idat,i0,ib1,ib2,npw_fft
1194 
1195  npw_fft=2*npw_k
1196  i0=1
1197  if (mpi_enreg%me_g0_fft==1) then! Do not include G=(0,0,0) twice
1198    npw_fft=npw_fft-1
1199    i0=2
1200  end if
1201 
1202  if (mpi_enreg%nproc_fft==1) then
1203    if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*2*ndat) then
1204      ABI_BUG('wrong size for cwavef')
1205    end if
1206  else
1207    if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*ndat) then
1208      ABI_BUG('wrong size for cwavef')
1209    end if
1210  end if
1211  if (size(cwavef_fft,1)/=2.or.size(cwavef_fft,2)/=npw_fft*ndat) then
1212    ABI_BUG('wrong size for cwavef_fft')
1213  end if
1214 
1215  if (mpi_enreg%nproc_fft==1) then
1216 
1217    do idat=1,ndat
1218      ib1=(idat-1)*npw_fft  ! band shift for cwavef_fft
1219      ib2=(idat-1)*2*npw_k ! band shift for cwavef
1220      ! C(G) = ( E(G) + E(-G)^* ) / 2 (factor 1/2 will be applied later)
1221      cwavef(1,i0+ib2:npw_k+ib2) = cwavef_fft(1,i0      +ib1:npw_k  +ib1) & !+Re(E( G))
1222                                  +cwavef_fft(1,1 +npw_k+ib1:npw_fft+ib1)   !+Re(E(-G))
1223      cwavef(2,i0+ib2:npw_k+ib2) = cwavef_fft(2,i0      +ib1:npw_k  +ib1) & !+Im(E( G))
1224                                  -cwavef_fft(2,1 +npw_k+ib1:npw_fft+ib1)   !-Im(E(-G))
1225      ! D(G) = ( iE(-G)^* - iE(G) ) / 2 (factor 1/2 will be applied later)
1226      cwavef(1,i0+npw_k+ib2:2*npw_k+ib2) = cwavef_fft(2,i0      +ib1:npw_k  +ib1) & !+Im(E( G))
1227                                          +cwavef_fft(2,1 +npw_k+ib1:npw_fft+ib1)   !+Im(E(-G))
1228      cwavef(2,i0+npw_k+ib2:2*npw_k+ib2) =-cwavef_fft(1,i0      +ib1:npw_k  +ib1) & !-Re(E( G))
1229                                          +cwavef_fft(1,1 +npw_k+ib1:npw_fft+ib1)   !+Re(E(-G))
1230      if (mpi_enreg%me_g0_fft==1) then
1231        ! Compute C(G=0) and D(G=0) and multiply by 2 as we apply 1/2 to the whole array shortly afterwards
1232        ! C(G=0) = Re(E(G=0))
1233        cwavef(1,1+ib2) = two*cwavef_fft(1,1+ib1)
1234        cwavef(2,1+ib2) = zero
1235        ! D(G=0) = Im(E(G=0))
1236        cwavef(1,1+npw_k+ib2) = two*cwavef_fft(2,1+ib1)
1237        cwavef(2,1+npw_k+ib2) = zero
1238      end if
1239 
1240    end do
1241 
1242    cwavef(:,:) = half*cwavef(:,:)
1243 
1244  else
1245 
1246    do idat=1,ndat
1247      ib1=(idat-1)*npw_fft  ! band shift for cwavef_fft
1248      ib2=(idat-1)*npw_k ! band shift for cwavef
1249      ! C(G) = E(G)
1250      cwavef(:,1+ib2:npw_k+ib2) = cwavef_fft(:,1+ib1:npw_k+ib1)
1251      if (i0==2) then
1252        cwavef(2,1+ib2) = zero ! This is important, otherwise it is numerically unstable...
1253      end if
1254    end do
1255 
1256  end if
1257 
1258 end subroutine cwavef_double_rfft_trick_unpack

ABINIT/getghc [ Functions ]

[ Top ] [ Functions ]

NAME

 getghc

FUNCTION

 Compute <G|H|C> for input vector |C> expressed in reciprocal space;
 Result is put in array ghc.
 <G|Vnonlocal + VfockACE|C> is also returned in gvnlxc if either NLoc NCPP or FockACE.
 If required, <G|S|C> is returned in gsc (S=overlap - PAW only)
 Note that left and right k points can be different, i.e. ghc=<k^prime+G|H|C_k>.

INPUTS

 cpopt=flag defining the status of cwaveprj%cp(:)=<Proj_i|Cnk> scalars (PAW only)
       (same meaning as in nonlop.F90 routine)
       if cpopt=-1, <p_lmn|in> (and derivatives) are computed here (and not saved)
       if cpopt= 0, <p_lmn|in> are computed here and saved
       if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
       if cpopt= 2  <p_lmn|in> are already in memory;
       if cpopt= 3  <p_lmn|in> are already in memory; first derivatives are computed here and saved
       if cpopt= 4  <p_lmn|in> and first derivatives are already in memory;
 cwavef(2,npw*my_nspinor*ndat)=planewave coefficients of wavefunction.
 cwavef_r(2,n4,n5,n6,nspinor) = wave function in real space
 gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian to be applied
 lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1
        Typically lambda is the eigenvalue (or its guess)
 mpi_enreg=information about MPI parallelization
 ndat=number of FFT to do in parallel
 prtvol=control print volume and debugging output
 sij_opt= -PAW ONLY-  if  0, only matrix elements <G|H|C> have to be computed
    (S=overlap)       if  1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc
                      if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used)
 tim_getghc=timing code of the calling subroutine(can be set to 0 if not attributed)
 type_calc= option governing which part of Hamitonian is to be applied:
            1: local part only
            2: non-local+Fock+kinetic only (added to the existing Hamiltonian)
            3: local + kinetic only (added to the existing Hamiltonian)
 ===== Optional inputs =====
   [kg_fft_k(3,:)]=optional, (k+G) vector coordinates to be used for the FFT tranformation
                   instead of the one contained in gs_ham datastructure.
                   Typically used for real WF (in parallel) which are FFT-transformed 2 by 2.
   [kg_fft_kp(3,:)]=optional, (k^prime+G) vector coordinates to be used for the FFT tranformation
   [select_k]=optional, option governing the choice of k points to be used.
             gs_ham datastructure contains quantities needed to apply Hamiltonian
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|H|k>       is applied [default]
             if select_k=2, <k|H|k^prime>       is applied
             if select_k=3, <k|H|k>             is applied
             if select_k=4, <k^prime|H|k^prime> is applied

OUTPUT

  ghc(2,npw*my_nspinor*ndat)=matrix elements <G|H|C> (if sij_opt>=0)
                                          or <G|H-lambda.S|C> (if sij_opt=-1)
  gvnlxc(2,npw*my_nspinor*ndat)=matrix elements <G|Vnonlocal+VFockACE|C> (if sij_opt>=0)
                                            or <G|Vnonlocal+VFockACE-lambda.S|C> (if sij_opt=-1)
      include Vnonlocal if NCPP and non-local Fock if associated(gs_ham%fockcommon)
  if (sij_opt=1)
    gsc(2,npw*my_nspinor*ndat)=matrix elements <G|S|C> (S=overlap).

SIDE EFFECTS

  cwaveprj(natom,my_nspinor*(1+cpopt)*ndat)= wave function projected on nl projectors (PAW only)

SOURCE

 145 subroutine getghc(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,&
 146                   prtvol,sij_opt,tim_getghc,type_calc,&
 147                   kg_fft_k,kg_fft_kp,select_k,cwavef_r) ! optional arguments
 148 
 149 !Arguments ------------------------------------
 150 !scalars
 151  integer,intent(in) :: cpopt,ndat, prtvol
 152  integer,intent(in) :: sij_opt,tim_getghc,type_calc
 153  integer,intent(in),optional :: select_k
 154  real(dp),intent(in) :: lambda
 155  type(MPI_type),intent(in) :: mpi_enreg
 156  type(gs_hamiltonian_type),intent(inout),target :: gs_ham
 157 !arrays
 158  integer,intent(in),optional,target :: kg_fft_k(:,:),kg_fft_kp(:,:)
 159  real(dp),intent(out),target :: gsc(:,:)
 160  real(dp),intent(inout), target :: cwavef(:,:)
 161  real(dp),optional,intent(inout) :: cwavef_r(:,:,:,:,:)
 162  real(dp),intent(out), target :: ghc(:,:)
 163  real(dp),intent(out),target :: gvnlxc(:,:)
 164  type(pawcprj_type),intent(inout),target :: cwaveprj(:,:)
 165  !MG: Passing these arrays assumed-shape has the drawback that client code is
 166  !forced to use vec(2, npw*ndat) instead of the more user-friendly vec(2,npw,ndat)
 167 
 168 !Local variables-------------------------------
 169 !scalars
 170  integer,parameter :: level=114,re=1,im=2,tim_fourwf=1
 171  integer :: choice,cplex,cpopt_here,i1,i2,i3,idat,idir,ierr
 172  integer :: ig,igspinor,istwf_k_,ii,iispinor,ikpt_this_proc,ipw,ispinor,my_nspinor
 173  integer :: n4,n5,n6,ndat_,nnlout,npw_fft,npw_k1,npw_k2,nspinortot,option_fft
 174  integer :: paw_opt,select_k_,shift1,shift2,signs,tim_nonlop
 175  logical(kind=c_bool) :: k1_eq_k2
 176  logical :: double_rfft_trick,have_to_reequilibrate,has_fock,local_gvnlxc
 177  logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc,use_cwavef_r
 178  real(dp) :: ghcim,ghcre,weight
 179  character(len=500) :: msg
 180 
 181 !arrays
 182  integer,  pointer                :: gbound_k1(:,:)
 183  integer,  pointer                :: gbound_k2(:,:)
 184  integer,  pointer                :: kg_k1(:,:)
 185  integer,  pointer                :: kg_k2(:,:)
 186  integer,  ABI_CONTIGUOUS pointer :: indices_pw_fft(:), kg_k_fft(:,:)
 187  integer,  ABI_CONTIGUOUS pointer :: recvcount_fft(:), recvdisp_fft(:)
 188  integer,  ABI_CONTIGUOUS pointer :: sendcount_fft(:), senddisp_fft(:)
 189  integer,  allocatable:: dimcprj(:)
 190  real(dp)                         :: enlout(ndat), lambda_ndat(ndat), tsec(2)
 191  real(dp), target                 :: nonlop_dum(1,1)
 192  real(dp), allocatable            :: buff_wf(:,:)
 193  real(dp), allocatable            :: cwavef1(:,:)
 194  real(dp), allocatable            :: cwavef2(:,:)
 195  real(dp), allocatable            :: cwavef_fft(:,:)
 196  real(dp), allocatable            :: cwavef_fft_tr(:,:)
 197 
 198  real(dp), allocatable            :: ghc1(:,:)
 199  real(dp), allocatable            :: ghc2(:,:)
 200  real(dp), allocatable            :: ghc3(:,:)
 201  real(dp), allocatable            :: ghc4(:,:)
 202  real(dp), allocatable            :: ghc_mGGA(:,:)
 203  real(dp), allocatable            :: ghc_vectornd(:,:)
 204 
 205 #if defined HAVE_GPU && defined HAVE_YAKL
 206  real(c_double), ABI_CONTIGUOUS pointer :: gvnlc(:,:)
 207 #else
 208  real(dp), allocatable            :: gvnlc(:,:)
 209 #endif
 210 
 211  real(dp), allocatable            :: vlocal_tmp(:,:,:)
 212  real(dp), allocatable            :: work(:,:,:,:)
 213 
 214 #if defined HAVE_GPU && defined HAVE_YAKL
 215  real(c_double), ABI_CONTIGUOUS pointer :: gvnlxc_(:,:)
 216 #else
 217  real(dp), pointer                :: gvnlxc_(:,:)
 218 #endif
 219 
 220  real(dp), pointer                :: kinpw_k1(:)
 221  real(dp), pointer                :: kinpw_k2(:)
 222  real(dp), pointer                :: kpt_k1(:)
 223  real(dp), pointer                :: kpt_k2(:)
 224  real(dp), pointer                :: gsc_ptr(:,:)
 225  type(fock_common_type),pointer :: fock
 226  type(pawcprj_type),pointer :: cwaveprj_fock(:,:),cwaveprj_idat(:,:),cwaveprj_nonlop(:,:)
 227 
 228  real(c_double), parameter        :: hugevalue = huge(zero)*1.d-11
 229 
 230 ! *********************************************************************
 231 
 232  DBG_ENTER("COLL")
 233 
 234  if(gs_ham%gpu_option==ABI_GPU_OPENMP) then
 235    call getghc_ompgpu(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,&
 236                   prtvol,sij_opt,tim_getghc,type_calc,&
 237                   kg_fft_k,kg_fft_kp,select_k,cwavef_r)
 238    return
 239  end if
 240 
 241 !Keep track of total time spent in getghc:
 242  call timab(350+tim_getghc,1,tsec)
 243  ABI_NVTX_START_RANGE(NVTX_GETGHC)
 244 
 245 !Structured debugging if prtvol==-level
 246  if(prtvol==-level)then
 247    write(msg,'(80a,a,a)') ('=',ii=1,80),ch10,' getghc : enter, debugging '
 248    call wrtout(std_out,msg,'PERS')
 249  end if
 250 
 251 !Select k-dependent objects according to select_k input parameter
 252  select_k_=1;if (present(select_k)) select_k_=select_k
 253  if (select_k_==KPRIME_H_K) then
 254 !  <k^prime|H|k>
 255    npw_k1    =  gs_ham%npw_fft_k ; npw_k2    =  gs_ham%npw_fft_kp
 256    kpt_k1    => gs_ham%kpt_k     ; kpt_k2    => gs_ham%kpt_kp
 257    kg_k1     => gs_ham%kg_k      ; kg_k2     => gs_ham%kg_kp
 258    gbound_k1 => gs_ham%gbound_k  ; gbound_k2 => gs_ham%gbound_kp
 259    kinpw_k1  => gs_ham%kinpw_k   ; kinpw_k2  => gs_ham%kinpw_kp
 260  else if (select_k_==K_H_KPRIME) then
 261 !  <k|H|k^prime>
 262    npw_k1    =  gs_ham%npw_fft_kp; npw_k2    =  gs_ham%npw_fft_k
 263    kpt_k1    => gs_ham%kpt_kp    ; kpt_k2    => gs_ham%kpt_k
 264    kg_k1     => gs_ham%kg_kp     ; kg_k2     => gs_ham%kg_k
 265    gbound_k1 => gs_ham%gbound_kp ; gbound_k2 => gs_ham%gbound_k
 266    kinpw_k1  => gs_ham%kinpw_kp  ; kinpw_k2  => gs_ham%kinpw_k
 267  else if (select_k_==K_H_K) then
 268 !  <k|H|k>
 269    npw_k1    =  gs_ham%npw_fft_k ; npw_k2    =  gs_ham%npw_fft_k
 270    kpt_k1    => gs_ham%kpt_k     ; kpt_k2    => gs_ham%kpt_k
 271    kg_k1     => gs_ham%kg_k      ; kg_k2     => gs_ham%kg_k
 272    gbound_k1 => gs_ham%gbound_k  ; gbound_k2 => gs_ham%gbound_k
 273    kinpw_k1  => gs_ham%kinpw_k   ; kinpw_k2  => gs_ham%kinpw_k
 274  else if (select_k_==KPRIME_H_KPRIME) then
 275 !  <k^prime|H|k^prime>
 276    npw_k1    =  gs_ham%npw_fft_kp; npw_k2    =  gs_ham%npw_fft_kp
 277    kpt_k1    => gs_ham%kpt_kp    ; kpt_k2    => gs_ham%kpt_kp
 278    kg_k1     => gs_ham%kg_kp     ; kg_k2     => gs_ham%kg_kp
 279    gbound_k1 => gs_ham%gbound_kp ; gbound_k2 => gs_ham%gbound_kp
 280    kinpw_k1  => gs_ham%kinpw_kp  ; kinpw_k2  => gs_ham%kinpw_kp
 281  end if
 282  k1_eq_k2=(all(abs(kpt_k1(:)-kpt_k2(:))<tol8))
 283 
 284 !Check sizes
 285  my_nspinor=max(1,gs_ham%nspinor/mpi_enreg%nproc_spinor)
 286  if (size(cwavef)<2*npw_k1*my_nspinor*ndat) then
 287    ABI_BUG('wrong size for cwavef!')
 288  end if
 289  if (size(ghc)<2*npw_k2*my_nspinor*ndat) then
 290    ABI_BUG('wrong size for ghc!')
 291  end if
 292  if (sij_opt==1) then
 293    if (size(gsc)<2*npw_k2*my_nspinor*ndat) then
 294      ABI_BUG('wrong size for gsc!')
 295    end if
 296  end if
 297  if (gs_ham%usepaw==1.and.cpopt>=0) then
 298    if (size(cwaveprj)<gs_ham%natom*my_nspinor*ndat) then
 299      ABI_BUG('wrong size for cwaveprj!')
 300    end if
 301  end if
 302  if (any(type_calc == [0, 2, 3])) then
 303    local_gvnlxc = size(gvnlxc)==0
 304    if (local_gvnlxc) then
 305      if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then
 306 #if defined HAVE_GPU && defined HAVE_YAKL
 307        ABI_MALLOC_MANAGED(gvnlxc_, (/2,npw_k2*my_nspinor*ndat/))
 308 #endif
 309      else
 310        ABI_MALLOC(gvnlxc_,(2,npw_k2*my_nspinor*ndat))
 311      end if
 312    else
 313      gvnlxc_ => gvnlxc
 314    end if
 315    if (size(gvnlxc_)<2*npw_k2*my_nspinor*ndat) then
 316      ABI_BUG('wrong size for gvnlxc!')
 317    end if
 318  end if
 319  use_cwavef_r=present(cwavef_r)
 320  n4=gs_ham%n4 ; n5=gs_ham%n5 ; n6=gs_ham%n6
 321  nspinortot=gs_ham%nspinor
 322  if (use_cwavef_r) then
 323    if (size(cwavef_r,1)/=2) then
 324      ABI_BUG('wrong size for cwavef_r (dimension 1)')
 325    end if
 326    if (size(cwavef_r,2)/=n4) then
 327      ABI_BUG('wrong size for cwavef_r (dimension 2)')
 328    end if
 329    if (size(cwavef_r,3)/=n5) then
 330      ABI_BUG('wrong size for cwavef_r (dimension 3)')
 331    end if
 332    if (size(cwavef_r,4)/=n6) then
 333      ABI_BUG('wrong size for cwavef_r (dimension 4)')
 334    end if
 335    if (size(cwavef_r,5)/=nspinortot) then
 336      ABI_BUG('wrong size for cwavef_r (dimension 5)')
 337    end if
 338  end if
 339 
 340 !Eventually overwrite plane waves data for FFT
 341  if (present(kg_fft_k)) then
 342    kg_k1 => kg_fft_k ; kg_k2 => kg_fft_k
 343    npw_k1=size(kg_k1,2) ; npw_k2=size(kg_k2,2)
 344  end if
 345  if (present(kg_fft_kp)) then
 346    kg_k2 => kg_fft_kp ; npw_k2=size(kg_k2,2)
 347  end if
 348 
 349 !paral_kgb constraint
 350  if (mpi_enreg%paral_kgb==1.and.(.not.k1_eq_k2)) then
 351    ABI_BUG('paral_kgb=1 not allowed for k/=k_^prime!')
 352  end if
 353 
 354 !Do we add Fock exchange term ?
 355  has_fock = associated(gs_ham%fockcommon)
 356  if (has_fock) fock => gs_ham%fockcommon
 357 
 358 !Parallelization over spinors management
 359  if (mpi_enreg%paral_spinor==0) then
 360    shift1=npw_k1;shift2=npw_k2
 361    nspinor1TreatedByThisProc=.true.
 362    nspinor2TreatedByThisProc=(nspinortot==2)
 363  else
 364    shift1=0;shift2=0
 365    nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0)
 366    nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1)
 367  end if
 368 
 369 !============================================================
 370 ! Application of the local potential
 371 !============================================================
 372 
 373  ABI_NVTX_START_RANGE(NVTX_GETGHC_LOCPOT)
 374 
 375  if (any(type_calc == [0, 1, 3])) then
 376 
 377 !  Need a Vlocal
 378    if (.not.associated(gs_ham%vlocal)) then
 379      ABI_BUG("We need vlocal in gs_ham!")
 380    end if
 381 
 382 !  fourwf can only process with one value of istwf_k
 383    if (.not.k1_eq_k2) then
 384      ABI_BUG('vlocal (fourwf) cannot be computed with k/=k^prime!')
 385    end if
 386 
 387 !  Eventually adjust load balancing for FFT (by changing FFT distrib)
 388    have_to_reequilibrate=.false.
 389    if (mpi_enreg%paral_kgb==1) then
 390      ikpt_this_proc=bandfft_kpt_get_ikpt()
 391      have_to_reequilibrate=bandfft_kpt(ikpt_this_proc)%have_to_reequilibrate
 392    end if
 393    ndat_                 = ndat
 394    istwf_k_              = gs_ham%istwf_k
 395    double_rfft_trick = istwf_k_==2.and.modulo(ndat,2)==0.and.k1_eq_k2
 396    double_rfft_trick = double_rfft_trick.and.(.not.have_to_reequilibrate).and.mpi_enreg%paral_kgb==1
 397    ! Note that the trick can be activated only if nspinortot=1 (if =2 then istwf_k=1), so gs_ham%nvloc=1 too
 398    ! For npfft>1, we don't use the trick but MPI-FFT does not handle istwf_k==2,
 399    ! so we have to use istwf_k=1 locally and build cwavef_fft in a similar way
 400    if (double_rfft_trick) then
 401      istwf_k_ = 1
 402      if (mpi_enreg%nproc_fft==1) then
 403        ndat_ = ndat / 2
 404      else
 405        ndat_ = ndat
 406      end if
 407      kg_k_fft => bandfft_kpt(ikpt_this_proc)%kg_k_gather_sym(:,:)
 408      npw_fft=2*npw_k1
 409      if (mpi_enreg%me_g0_fft==1) npw_fft=npw_fft-1 ! Do not include G=(0,0,0) twice
 410      ABI_MALLOC(cwavef_fft,(2,npw_fft*ndat_))
 411      call cwavef_double_rfft_trick_pack(cwavef,cwavef_fft,mpi_enreg,ndat_,npw_k1)
 412    end if
 413 
 414    if (have_to_reequilibrate) then
 415      npw_fft =  bandfft_kpt(ikpt_this_proc)%npw_fft
 416      sendcount_fft  => bandfft_kpt(ikpt_this_proc)%sendcount_fft(:)
 417      recvcount_fft  => bandfft_kpt(ikpt_this_proc)%recvcount_fft(:)
 418      senddisp_fft   => bandfft_kpt(ikpt_this_proc)%senddisp_fft(:)
 419      recvdisp_fft   => bandfft_kpt(ikpt_this_proc)%recvdisp_fft(:)
 420      indices_pw_fft => bandfft_kpt(ikpt_this_proc)%indices_pw_fft(:)
 421      kg_k_fft       => bandfft_kpt(ikpt_this_proc)%kg_k_fft(:,:)
 422      ABI_MALLOC(buff_wf,(2,npw_k1*ndat_) )
 423      ABI_MALLOC(cwavef_fft,(2,npw_fft*ndat_) )
 424      if(ndat_>1) then
 425        ABI_MALLOC(cwavef_fft_tr, (2,npw_fft*ndat_))
 426      end if
 427      do idat=1, ndat_
 428        do ipw = 1 ,npw_k1
 429          buff_wf(1:2, idat + ndat_*(indices_pw_fft(ipw)-1) ) = cwavef(1:2,ipw + npw_k1*(idat-1))
 430        end do
 431      end do
 432      if(ndat_ > 1) then
 433        call xmpi_alltoallv(buff_wf,2*ndat_*sendcount_fft,2*ndat_*senddisp_fft,  &
 434 &       cwavef_fft_tr,2*ndat_*recvcount_fft, 2*ndat_*recvdisp_fft, mpi_enreg%comm_fft,ierr)
 435 !      We need to transpose data
 436        do idat=1,ndat_
 437          do ipw = 1 ,npw_fft
 438            cwavef_fft(1:2,  ipw + npw_fft*(idat-1)) = cwavef_fft_tr(1:2,  idat + ndat_*(ipw-1))
 439          end do
 440        end do
 441      else
 442        call xmpi_alltoallv(buff_wf,2*sendcount_fft,2*senddisp_fft,  &
 443 &       cwavef_fft,2*recvcount_fft, 2*recvdisp_fft, mpi_enreg%comm_fft,ierr)
 444      end if
 445    end if
 446 
 447 !  Apply the local potential to the wavefunction
 448 !  Start from wavefunction in reciprocal space cwavef
 449 !  End with function ghc in reciprocal space also.
 450    ABI_MALLOC(work,(2,gs_ham%n4,gs_ham%n5,gs_ham%n6*ndat_))
 451    weight=one
 452    if (.not.use_cwavef_r) then
 453      option_fft=2
 454      if (nspinortot==2) then
 455        ABI_MALLOC(cwavef1,(2,npw_k1*ndat_))
 456        ABI_MALLOC(cwavef2,(2,npw_k1*ndat_))
 457        do idat=1,ndat_
 458          do ipw=1,npw_k1
 459            cwavef1(1:2,ipw+(idat-1)*npw_k1)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k1)
 460            cwavef2(1:2,ipw+(idat-1)*npw_k1)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k1+shift1)
 461          end do
 462        end do
 463 !      call cg_zcopy(npw_k1*ndat_,cwavef(1,1),cwavef1)
 464 !      call cg_zcopy(npw_k1*ndat_,cwavef(1,1+shift1),cwavef2)
 465      end if
 466    else
 467      option_fft=3
 468      if (nspinortot==2) then
 469        ABI_MALLOC(cwavef1,(0,0))
 470        ABI_MALLOC(cwavef2,(0,0))
 471      end if
 472    end if
 473 
 474    if (gs_ham%nvloc==1) then
 475 !  Treat scalar local potentials
 476 
 477      if (nspinortot==1) then
 478 
 479        if (use_cwavef_r) then
 480          do i3=1,n6
 481            do i2=1,n5
 482              do i1=1,n4
 483                work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1)
 484                work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1)
 485              end do
 486            end do
 487          end do
 488        end if
 489        if (have_to_reequilibrate.or.double_rfft_trick) then
 490          call fourwf(1,gs_ham%vlocal,cwavef_fft,cwavef_fft,work,gbound_k1,gbound_k2,&
 491 &         istwf_k_,kg_k_fft,kg_k_fft,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 492 &         npw_fft,npw_fft,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,&
 493 &         weight,weight,gpu_option=gs_ham%gpu_option)
 494        else
 495          call fourwf(1,gs_ham%vlocal,cwavef,ghc,work,gbound_k1,gbound_k2,&
 496 &         istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 497 &         npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,&
 498 &         weight,weight,gpu_option=gs_ham%gpu_option)
 499        end if
 500 
 501      else
 502        ! nspinortot==2
 503 
 504        if (nspinor1TreatedByThisProc) then
 505          if (use_cwavef_r) then
 506            do i3=1,n6
 507              do i2=1,n5
 508                do i1=1,n4
 509                  work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1)
 510                  work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1)
 511                end do
 512              end do
 513            end do
 514          end if
 515          ABI_MALLOC(ghc1,(2,npw_k2*ndat_))
 516          call fourwf(1,gs_ham%vlocal,cwavef1,ghc1,work,gbound_k1,gbound_k2,&
 517 &         istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 518 &         npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,&
 519 &         weight,weight,gpu_option=gs_ham%gpu_option)
 520          do idat=1,ndat_
 521            do ipw =1, npw_k2
 522              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)=ghc1(1:2,ipw+(idat-1)*npw_k2)
 523            end do
 524          end do
 525          ABI_FREE(ghc1)
 526        end if ! spin 1 treated by this proc
 527 
 528        if (nspinor2TreatedByThisProc) then
 529          if (use_cwavef_r) then
 530            do i3=1,n6
 531              do i2=1,n5
 532                do i1=1,n4
 533                  work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,2)
 534                  work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,2)
 535                end do
 536              end do
 537            end do
 538          end if
 539          ABI_MALLOC(ghc2,(2,npw_k2*ndat_))
 540          call fourwf(1,gs_ham%vlocal,cwavef2,ghc2,work,gbound_k1,gbound_k2,&
 541 &         istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 542 &         npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
 543 &         gpu_option=gs_ham%gpu_option)
 544          do idat=1,ndat_
 545            do ipw=1,npw_k2
 546              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc2(1:2,ipw+(idat-1)*npw_k2)
 547            end do
 548          end do
 549          ABI_FREE(ghc2)
 550        end if ! spin 2 treated by this proc
 551 
 552      end if ! nspinortot
 553 
 554    else if (gs_ham%nvloc==4) then
 555 !    Treat non-collinear local potentials
 556 
 557      ABI_MALLOC(ghc1,(2,npw_k2*ndat_))
 558      ABI_MALLOC(ghc2,(2,npw_k2*ndat_))
 559      ABI_MALLOC(ghc3,(2,npw_k2*ndat_))
 560      ABI_MALLOC(ghc4,(2,npw_k2*ndat_))
 561      ghc1(:,:)=zero; ghc2(:,:)=zero; ghc3(:,:)=zero ;  ghc4(:,:)=zero
 562      if (use_cwavef_r) then
 563        ABI_MALLOC(vlocal_tmp,(0,0,0))
 564      else
 565        ABI_MALLOC(vlocal_tmp,(gs_ham%n4,gs_ham%n5,gs_ham%n6))
 566      end if
 567 !    ghc1=v11*phi1
 568      if (nspinor1TreatedByThisProc) then
 569        if (use_cwavef_r) then
 570          do i3=1,n6
 571            do i2=1,n5
 572              do i1=1,n4
 573                work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1)
 574                work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1)
 575              end do
 576            end do
 577          end do
 578        else
 579          ! LB,07/22:
 580          ! Weird segmentation fault encountered here if called with multithreaded_getghc for big systems.
 581          ! Using an explicit loop instead of fortran syntax seems to solve the problem, I don't understand why...
 582          !vlocal_tmp(:,:,:)=gs_ham%vlocal(:,:,:,1)
 583          do i3=1,n6
 584            do i2=1,n5
 585              do i1=1,n4
 586                vlocal_tmp(i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)
 587              end do
 588            end do
 589          end do
 590        end if
 591        call fourwf(1,vlocal_tmp,cwavef1,ghc1,work,gbound_k1,gbound_k2,&
 592 &       istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 593 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
 594 &       gpu_option=gs_ham%gpu_option)
 595      end if
 596 !    ghc2=v22*phi2
 597      if (nspinor2TreatedByThisProc) then
 598        if (use_cwavef_r) then
 599          do i3=1,n6
 600            do i2=1,n5
 601              do i1=1,n4
 602                work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)*cwavef_r(1,i1,i2,i3,2)
 603                work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)*cwavef_r(2,i1,i2,i3,2)
 604              end do
 605            end do
 606          end do
 607        else
 608          ! LB,07/22:
 609          ! Weird segmentation fault encountered here if called with multithreaded_getghc for big systems.
 610          ! Using an explicit loop instead of fortran syntax seems to solve the problem, I don't understand why...
 611          !vlocal_tmp(:,:,:)=gs_ham%vlocal(:,:,:,2)
 612          do i3=1,n6
 613            do i2=1,n5
 614              do i1=1,n4
 615                vlocal_tmp(i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)
 616              end do
 617            end do
 618          end do
 619        end if
 620        call fourwf(1,vlocal_tmp,cwavef2,ghc2,work,gbound_k1,gbound_k2,&
 621 &       istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 622 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
 623 &       gpu_option=gs_ham%gpu_option)
 624      end if
 625      ABI_FREE(vlocal_tmp)
 626      cplex=2
 627      if (use_cwavef_r) then
 628        ABI_MALLOC(vlocal_tmp,(0,0,0))
 629      else
 630        ABI_MALLOC(vlocal_tmp,(cplex*gs_ham%n4,gs_ham%n5,gs_ham%n6))
 631      end if
 632 !    ghc3=(re(v12)-im(v12))*phi1
 633      if (nspinor1TreatedByThisProc) then
 634        if (use_cwavef_r) then
 635          do i3=1,n6
 636            do i2=1,n5
 637              do i1=1,n4
 638                work(1,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(1,i1,i2,i3,1)+gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(2,i1,i2,i3,1)
 639                work(2,i1,i2,i3)=-gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(1,i1,i2,i3,1)+gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(2,i1,i2,i3,1)
 640              end do
 641            end do
 642          end do
 643        else
 644          do i3=1,gs_ham%n6
 645            do i2=1,gs_ham%n5
 646              do i1=1,gs_ham%n4
 647                vlocal_tmp(2*i1-1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)
 648                vlocal_tmp(2*i1  ,i2,i3)=-gs_ham%vlocal(i1,i2,i3,4)
 649              end do
 650            end do
 651          end do
 652        end if
 653        call fourwf(cplex,vlocal_tmp,cwavef1,ghc3,work,gbound_k1,gbound_k2,&
 654 &       istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 655 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
 656 &       gpu_option=gs_ham%gpu_option)
 657      end if
 658 !    ghc4=(re(v12)+im(v12))*phi2
 659      if (nspinor2TreatedByThisProc) then
 660        if (use_cwavef_r) then
 661          do i3=1,n6
 662            do i2=1,n5
 663              do i1=1,n4
 664                work(1,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(1,i1,i2,i3,2)-gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(2,i1,i2,i3,2)
 665                work(2,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(1,i1,i2,i3,2)+gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(2,i1,i2,i3,2)
 666              end do
 667            end do
 668          end do
 669        else
 670          do i3=1,gs_ham%n6
 671            do i2=1,gs_ham%n5
 672              do i1=1,gs_ham%n4
 673                vlocal_tmp(2*i1-1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)
 674                vlocal_tmp(2*i1  ,i2,i3)= gs_ham%vlocal(i1,i2,i3,4)
 675              end do
 676            end do
 677          end do
 678        end if
 679        call fourwf(cplex,vlocal_tmp,cwavef2,ghc4,work,gbound_k1,gbound_k2,&
 680 &       istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,&
 681 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
 682 &       gpu_option=gs_ham%gpu_option)
 683      end if
 684      ABI_FREE(vlocal_tmp)
 685 !    Build ghc from pieces
 686 !    (v11,v22,Re(v12)+iIm(v12);Re(v12)-iIm(v12))(psi1;psi2): matrix product
 687      if (mpi_enreg%paral_spinor==0) then
 688        do idat=1,ndat_
 689          do ipw=1,npw_k2
 690            ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)       =ghc1(1:2,ipw+(idat-1)*npw_k2)+ghc4(1:2,ipw+(idat-1)*npw_k2)
 691            ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc3(1:2,ipw+(idat-1)*npw_k2)+ghc2(1:2,ipw+(idat-1)*npw_k2)
 692          end do
 693        end do
 694      else
 695        call xmpi_sum(ghc4,mpi_enreg%comm_spinor,ierr)
 696        call xmpi_sum(ghc3,mpi_enreg%comm_spinor,ierr)
 697        if (nspinor1TreatedByThisProc) then
 698          do idat=1,ndat_
 699            do ipw=1,npw_k2
 700              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)=ghc1(1:2,ipw+(idat-1)*npw_k2)+ghc4(1:2,ipw+(idat-1)*npw_k2)
 701            end do
 702          end do
 703        else if (nspinor2TreatedByThisProc) then
 704          do idat=1,ndat_
 705            do ipw=1,npw_k2
 706              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc3(1:2,ipw+(idat-1)*npw_k2)+ghc2(1:2,ipw+(idat-1)*npw_k2)
 707            end do
 708          end do
 709        end if
 710      end if
 711      ABI_FREE(ghc1)
 712      ABI_FREE(ghc2)
 713      ABI_FREE(ghc3)
 714      ABI_FREE(ghc4)
 715    end if ! nvloc
 716 
 717    if (nspinortot==2)  then
 718      ABI_FREE(cwavef1)
 719      ABI_FREE(cwavef2)
 720    end if
 721 
 722    ABI_FREE(work)
 723 
 724    if (double_rfft_trick) then
 725      call cwavef_double_rfft_trick_unpack(ghc,cwavef_fft,mpi_enreg,ndat_,npw_k1)
 726      ABI_FREE(cwavef_fft)
 727    end if
 728 
 729 !  Retrieve eventually original FFT distrib
 730    if(have_to_reequilibrate) then
 731      if(ndat_ > 1 ) then
 732        do idat=1,ndat_
 733          do ipw = 1 ,npw_fft
 734            cwavef_fft_tr(1:2,  idat + ndat_*(ipw-1)) = cwavef_fft(1:2,  ipw + npw_fft*(idat-1))
 735          end do
 736        end do
 737        call xmpi_alltoallv(cwavef_fft_tr,2*ndat_*recvcount_fft, 2*ndat_*recvdisp_fft, &
 738 &       buff_wf,2*ndat_*sendcount_fft,2*ndat_*senddisp_fft, mpi_enreg%comm_fft,ierr)
 739      else
 740        call xmpi_alltoallv(cwavef_fft,2*recvcount_fft, 2*recvdisp_fft, &
 741 &       buff_wf,2*sendcount_fft,2*senddisp_fft, mpi_enreg%comm_fft,ierr)
 742      end if
 743      do idat=1,ndat_
 744        do ipw = 1 ,npw_k2
 745          ghc(1:2,ipw + npw_k2*(idat-1)) = buff_wf(1:2, idat + ndat_*(indices_pw_fft(ipw)-1))
 746        end do
 747      end do
 748      ABI_FREE(buff_wf)
 749      ABI_FREE(cwavef_fft)
 750      if(ndat_ > 1) then
 751        ABI_FREE(cwavef_fft_tr)
 752      end if
 753    end if
 754 
 755 !  Add metaGGA contribution
 756    if (associated(gs_ham%vxctaulocal)) then
 757      if (.not.k1_eq_k2) then
 758        ABI_BUG('metaGGA not allowed for k/=k_^prime!')
 759      end if
 760      if (size(gs_ham%vxctaulocal)/=gs_ham%n4*gs_ham%n5*gs_ham%n6*gs_ham%nvloc*4) then
 761        ABI_BUG('wrong sizes for vxctaulocal!')
 762      end if
 763      ABI_MALLOC(ghc_mGGA,(2,npw_k2*my_nspinor*ndat_))
 764      call getghc_mGGA(cwavef,ghc_mGGA,gbound_k1,gs_ham%gprimd,istwf_k_,kg_k1,kpt_k1,&
 765 &     gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,npw_k1,gs_ham%nvloc,&
 766 &     gs_ham%n4,gs_ham%n5,gs_ham%n6,my_nspinor,gs_ham%vxctaulocal,gs_ham%gpu_option)
 767      ghc(1:2,1:npw_k2*my_nspinor*ndat_)=ghc(1:2,1:npw_k2*my_nspinor*ndat_)+ghc_mGGA(1:2,1:npw_k2*my_nspinor*ndat_)
 768      ABI_FREE(ghc_mGGA)
 769    end if
 770 
 771    !  Add nuclear dipole moment contribution
 772    if (associated(gs_ham%vectornd)) then
 773      if (.not.k1_eq_k2) then
 774        ABI_BUG('nuclear dipole vector potential not allowed for k/=k_^prime!')
 775      end if
 776      if (size(gs_ham%vectornd)/=gs_ham%n4*gs_ham%n5*gs_ham%n6*gs_ham%nvloc*3) then
 777        ABI_BUG('wrong sizes for vectornd in getghc!')
 778      end if
 779      ABI_MALLOC(ghc_vectornd,(2,npw_k2*my_nspinor*ndat_))
 780      ghc_vectornd=zero
 781      call getghc_nucdip(cwavef,ghc_vectornd,gbound_k1,istwf_k_,kg_k1,kpt_k1,&
 782 &     gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,npw_k1,gs_ham%nvloc,&
 783 &     gs_ham%n4,gs_ham%n5,gs_ham%n6,my_nspinor,gs_ham%vectornd,gs_ham%gpu_option)
 784      ghc(1:2,1:npw_k2*my_nspinor*ndat_)=ghc(1:2,1:npw_k2*my_nspinor*ndat_)+ghc_vectornd(1:2,1:npw_k2*my_nspinor*ndat_)
 785      ABI_FREE(ghc_vectornd)
 786    end if
 787 
 788  end if ! type_calc
 789 
 790  ABI_NVTX_END_RANGE()
 791 
 792 
 793  if (any(type_calc == [0, 2, 3])) then
 794 
 795 !============================================================
 796 ! Application of the non-local potential and the Fock potential
 797 !============================================================
 798 
 799    ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
 800 
 801    if (type_calc==0 .or. type_calc==2) then
 802      signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1
 803      cpopt_here=-1;if (gs_ham%usepaw==1) cpopt_here=cpopt
 804      if (has_fock) then
 805        if (gs_ham%usepaw==1) then
 806          cpopt_here=max(cpopt,0)
 807          if (cpopt<2) then
 808            ABI_MALLOC(cwaveprj_fock,(gs_ham%natom,my_nspinor*ndat))
 809            ABI_MALLOC(dimcprj,(gs_ham%natom))
 810            call pawcprj_getdim(dimcprj,gs_ham%natom,gs_ham%nattyp,gs_ham%ntypat,&
 811 &           gs_ham%typat,fock%pawtab,'O')
 812            call pawcprj_alloc(cwaveprj_fock,0,dimcprj)
 813            ABI_FREE(dimcprj)
 814          else
 815            cwaveprj_fock=>cwaveprj
 816          end if
 817          cwaveprj_nonlop=>cwaveprj_fock
 818        else
 819          cwaveprj_nonlop=>cwaveprj
 820          cwaveprj_fock=>cwaveprj
 821        end if
 822      else
 823        cwaveprj_nonlop=>cwaveprj
 824      end if
 825      paw_opt=gs_ham%usepaw ; if (sij_opt/=0) paw_opt=sij_opt+3
 826      lambda_ndat = lambda
 827 
 828      if (gs_ham%usepaw==0) gsc_ptr => nonlop_dum
 829      if (gs_ham%usepaw==1) gsc_ptr => gsc
 830      call nonlop(choice,cpopt_here,cwaveprj_nonlop,enlout,gs_ham,idir,lambda_ndat,mpi_enreg,ndat,&
 831 &     nnlout,paw_opt,signs,gsc_ptr,tim_nonlop,cwavef,gvnlxc_,select_k=select_k_)
 832 
 833      if (gs_ham%usepaw==1 .and. has_fock)then
 834        if (fock_get_getghc_call(fock)==1) then
 835          if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then
 836 #if defined HAVE_GPU && defined HAVE_YAKL
 837            ABI_MALLOC_MANAGED(gvnlc, (/2,npw_k2*my_nspinor*ndat/))
 838 #endif
 839          else
 840            ABI_MALLOC(gvnlc, (2,npw_k2*my_nspinor*ndat))
 841          end if
 842          gvnlc=gvnlxc_
 843        endif
 844      endif
 845 
 846 !    Calculation of the Fock exact exchange contribution from the Fock or ACE operator
 847      if (has_fock) then
 848        if (fock_get_getghc_call(fock)==1) then
 849          if (gs_ham%usepaw==0) cwaveprj_idat => cwaveprj
 850          if (fock%use_ACE==0) then
 851            call timab(360,1,tsec)
 852            do idat=1,ndat
 853              if (gs_ham%usepaw==1) cwaveprj_idat => cwaveprj_fock(:,(idat-1)*my_nspinor+1:idat*my_nspinor)
 854              call fock_getghc(cwavef(:,1+(idat-1)*npw_k1*my_nspinor:idat*npw_k1*my_nspinor),cwaveprj_idat,&
 855 &             gvnlxc_(:,1+(idat-1)*npw_k2*my_nspinor:idat*npw_k2*my_nspinor),gs_ham,mpi_enreg)
 856            end do ! idat
 857            call timab(360,2,tsec)
 858          else
 859            do idat=1,ndat
 860              call fock_ACE_getghc(cwavef(:,1+(idat-1)*npw_k1*my_nspinor:idat*npw_k1*my_nspinor),&
 861 &             gvnlxc_(:,1+(idat-1)*npw_k2*my_nspinor:idat*npw_k2*my_nspinor),gs_ham,mpi_enreg)
 862            end do ! idat
 863          end if
 864        end if
 865      end if
 866 
 867    else if (type_calc == 3) then
 868      ! for kinetic and local only, nonlocal and vfock should be zero
 869      gvnlxc_(:,:) = zero
 870    end if ! if(type_calc...
 871 
 872    ABI_NVTX_END_RANGE()
 873 
 874 !============================================================
 875 ! Assemble kinetic, local, nonlocal and Fock contributions
 876 !============================================================
 877 
 878    ABI_NVTX_START_RANGE(NVTX_GETGHC_KIN)
 879 
 880 #ifdef FC_NVHPC
 881    !FIXME This Kokkos kernel seems to cause issues under NVHPC so it is disabled
 882    if (.false.) then
 883 #else
 884    if (gs_ham%gpu_option == ABI_GPU_KOKKOS) then
 885 #endif
 886 
 887 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS)
 888      call assemble_energy_contribution_kokkos(c_loc(ghc), &
 889        & c_loc(gsc), c_loc(kinpw_k2), c_loc(cwavef), c_loc(gvnlxc_), &
 890        & ndat, my_nspinor, npw_k2, sij_opt, k1_eq_k2, hugevalue)
 891      ! sync device so that data can be reused safely on host
 892      ! will probably be moved elsewhere once all the scf loop runs on device
 893      call gpu_device_synchronize()
 894 #endif
 895 
 896    else
 897 
 898 #ifdef FC_NVHPC
 899 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS)
 900      !Related to FIXME above
 901      if (gs_ham%gpu_option == ABI_GPU_KOKKOS) call gpu_device_synchronize()
 902 #endif
 903 #endif
 904 
 905      !  Assemble modified kinetic, local and nonlocal contributions
 906      !  to <G|H|C(n,k)>. Take also into account build-in debugging.
 907      if(prtvol/=-level)then
 908        do idat=1,ndat
 909          if (k1_eq_k2) then
 910            !$OMP PARALLEL DO PRIVATE(igspinor) COLLAPSE(2) IF(gemm_nonlop_use_gemm)
 911            do ispinor=1,my_nspinor
 912              do ig=1,npw_k2
 913                igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1)
 914                if(kinpw_k2(ig)<huge(zero)*1.d-11)then
 915                  ghc(re,igspinor) = ghc(re,igspinor) + kinpw_k2(ig)*cwavef(re,igspinor) + gvnlxc_(re,igspinor)
 916                  ghc(im,igspinor) = ghc(im,igspinor) + kinpw_k2(ig)*cwavef(im,igspinor) + gvnlxc_(im,igspinor)
 917                else
 918                  ghc(re,igspinor)=zero
 919                  ghc(im,igspinor)=zero
 920                  if (sij_opt==1) then
 921                    gsc(re,igspinor)=zero
 922                    gsc(im,igspinor)=zero
 923                  end if
 924                end if
 925              end do ! ig
 926            end do ! ispinor
 927 
 928          else
 929            !$OMP PARALLEL DO PRIVATE(igspinor) COLLAPSE(2) IF(gemm_nonlop_use_gemm)
 930            do ispinor=1,my_nspinor
 931              do ig=1,npw_k2
 932                igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1)
 933                if(kinpw_k2(ig)<huge(zero)*1.d-11)then
 934                  ghc(re,igspinor)= ghc(re,igspinor) + gvnlxc_(re,igspinor)
 935                  ghc(im,igspinor)= ghc(im,igspinor) + gvnlxc_(im,igspinor)
 936                else
 937                  ghc(re,igspinor)=zero
 938                  ghc(im,igspinor)=zero
 939                  if (sij_opt==1) then
 940                    gsc(re,igspinor)=zero
 941                    gsc(im,igspinor)=zero
 942                  end if
 943                end if
 944              end do ! ig
 945            end do ! ispinor
 946          end if
 947        end do ! idat
 948      else
 949        !    Here, debugging section
 950        call wrtout(std_out,' getghc : components of ghc ','PERS')
 951        write(msg,'(a)')&
 952          &     'icp ig ispinor igspinor re/im     ghc        kinpw         cwavef      glocc        gvnlxc  gsc'
 953        call wrtout(std_out,msg,'PERS')
 954        do idat=1,ndat
 955          do ispinor=1,my_nspinor
 956            do ig=1,npw_k2
 957              igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1)
 958              if(kinpw_k2(ig)<huge(zero)*1.d-11)then
 959                if (k1_eq_k2) then
 960                  ghcre=kinpw_k2(ig)*cwavef(re,igspinor)+ghc(re,igspinor)+gvnlxc_(re,igspinor)
 961                  ghcim=kinpw_k2(ig)*cwavef(im,igspinor)+ghc(im,igspinor)+gvnlxc_(im,igspinor)
 962                else
 963                  ghcre=ghc(re,igspinor)+gvnlxc_(re,igspinor)
 964                  ghcim=ghc(im,igspinor)+gvnlxc_(im,igspinor)
 965                end if
 966              else
 967                ghcre=zero
 968                ghcim=zero
 969                if (sij_opt==1) then
 970                  gsc(re,igspinor)=zero
 971                  gsc(im,igspinor)=zero
 972                end if
 973              end if
 974              iispinor=ispinor;if (mpi_enreg%paral_spinor==1) iispinor=mpi_enreg%me_spinor+1
 975              if (sij_opt == 1) then
 976                write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  1 ', ig, iispinor, igspinor,ghcre,&
 977                  &             kinpw_k2(ig),cwavef(re,igspinor),ghc(re,igspinor),gvnlxc_(re,igspinor), gsc(re,igspinor)
 978                call wrtout(std_out,msg,'PERS')
 979                write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  2 ', ig, iispinor, igspinor,ghcim,&
 980                  &             kinpw_k2(ig),cwavef(im,igspinor),ghc(im,igspinor),gvnlxc_(im,igspinor), gsc(im,igspinor)
 981                call wrtout(std_out,msg,'PERS')
 982              else
 983                write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  1 ', ig, iispinor, igspinor,ghcre,&
 984                  &             kinpw_k2(ig),cwavef(re,igspinor),ghc(re,igspinor),gvnlxc_(re,igspinor)
 985                call wrtout(std_out,msg,'PERS')
 986                write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  2 ', ig, iispinor, igspinor,ghcim,&
 987                  &             kinpw_k2(ig),cwavef(im,igspinor),ghc(im,igspinor),gvnlxc_(im,igspinor)
 988                call wrtout(std_out,msg,'PERS')
 989              end if
 990              ghc(re,igspinor)=ghcre
 991              ghc(im,igspinor)=ghcim
 992            end do ! ig
 993          end do ! ispinor
 994        end do ! idat
 995      end if
 996    end if ! gs_ham%gpu_option
 997 
 998    ABI_NVTX_END_RANGE()
 999 
1000 !  Special case of PAW + Fock : only return Fock operator contribution in gvnlxc_
1001    if (gs_ham%usepaw==1 .and. has_fock) then
1002      gvnlxc_=gvnlxc_-gvnlc
1003      if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then
1004 #if defined HAVE_GPU && defined HAVE_YAKL
1005        ABI_FREE_MANAGED(gvnlc)
1006 #endif
1007      else
1008        ABI_FREE(gvnlc)
1009      end if
1010    endif
1011 
1012    if (local_gvnlxc) then
1013      if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then
1014 #if defined HAVE_GPU && defined HAVE_YAKL
1015        ABI_FREE_MANAGED(gvnlxc_)
1016 #endif
1017      else
1018        ABI_FREE(gvnlxc_)
1019      end if
1020    end if
1021 
1022 !  Structured debugging : if prtvol=-level, stop here.
1023    if(prtvol==-level)then
1024      write(msg,'(a,i0,a)')' getghc : exit prtvol=-',level,', debugging mode => stop '
1025      ABI_ERROR(msg)
1026    end if
1027 
1028    if (type_calc==0.or.type_calc==2) then
1029      if (has_fock.and.gs_ham%usepaw==1.and.cpopt<2) then
1030        call pawcprj_free(cwaveprj_fock)
1031        ABI_FREE(cwaveprj_fock)
1032      end if
1033    end if
1034 
1035  end if ! type_calc
1036 
1037  call timab(350+tim_getghc,2,tsec)
1038 
1039  DBG_EXIT("COLL")
1040  ABI_NVTX_END_RANGE()
1041 
1042 end subroutine getghc

ABINIT/getghc_mGGA [ Functions ]

[ Top ] [ Functions ]

NAME

 getghc_mGGA

FUNCTION

 Compute metaGGA contribution to <G|H|C> for input vector |C> expressed in reciprocal space.

INPUTS

 cwavef(2,npw_k*my_nspinor*ndat)=planewave coefficients of wavefunction.
 gbound_k(2*mgfft+4)=sphere boundary info
 istwf_k=input parameter that describes the storage of wfs
 kg_k(3,npw_k)=G vec coordinates wrt recip lattice transl.
 kpt(3)=current k point
 mgfft=maximum single fft dimension
 mpi_enreg=information about MPI parallelization
 my_nspinor=number of spinorial components of the wavefunctions (on current proc)
 ndat=number of FFTs to perform in parall
 ngfft(18)=contain all needed information about 3D FFT
 npw_k=number of planewaves in basis for given k point.
 nvloc=number of spin components of vxctaulocal
 n4,n5,n6=for dimensionning of vxctaulocal
 gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
 vxctaulocal(n4,n5,n6,nvloc,4)= local potential corresponding to the derivative of XC energy with respect to
  kinetic energy density, in real space, on the augmented fft grid.
  This array contains also the gradient of vxctaulocal (gvxctaulocal) in vxctaulocal(:,:,:,:,2:4).

OUTPUT

  ghc_mGGA(2,npw_k*my_nspinor*ndat)=metaGGA contribution to <G|H|C>

SIDE EFFECTS

SOURCE

1536 subroutine getghc_mGGA(cwavef,ghc_mGGA,gbound_k,gprimd,istwf_k,kg_k,kpt,mgfft,mpi_enreg,&
1537 &                      ndat,ngfft,npw_k,nvloc,n4,n5,n6,my_nspinor,vxctaulocal,gpu_option)
1538 
1539 !Arguments ------------------------------------
1540 !scalars
1541  integer,intent(in) :: istwf_k,mgfft,my_nspinor,ndat,npw_k,nvloc,n4,n5,n6,gpu_option
1542  type(MPI_type),intent(in) :: mpi_enreg
1543 !arrays
1544  integer,intent(in) :: gbound_k(2*mgfft+4),kg_k(3,npw_k),ngfft(18)
1545  real(dp),intent(in) :: gprimd(3,3),kpt(3)
1546  real(dp),intent(inout) :: cwavef(2,npw_k*my_nspinor*ndat)
1547  real(dp),intent(inout) :: ghc_mGGA(2,npw_k*my_nspinor*ndat)
1548  real(dp),intent(inout) :: vxctaulocal(n4,n5,n6,nvloc,4)
1549 
1550 !Local variables-------------------------------
1551 !scalars
1552  integer,parameter :: tim_fourwf=1
1553  integer :: idat,idir,ipw,nspinortot,shift
1554  logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
1555  real(dp) :: gp2pi1,gp2pi2,gp2pi3,kpt_cart,kg_k_cart,weight=one
1556 !arrays
1557  real(dp),allocatable :: cwavef1(:,:),cwavef2(:,:)
1558  real(dp),allocatable :: gcwavef(:,:,:),gcwavef1(:,:,:),gcwavef2(:,:,:)
1559  real(dp),allocatable :: ghc1(:,:),ghc2(:,:)
1560  real(dp),allocatable :: lcwavef(:,:),lcwavef1(:,:),lcwavef2(:,:)
1561  real(dp),allocatable :: work(:,:,:,:)
1562 
1563 ! *********************************************************************
1564 
1565  ghc_mGGA(:,:)=zero
1566  if (nvloc/=1) return
1567 
1568  nspinortot=min(2,(1+mpi_enreg%paral_spinor)*my_nspinor)
1569  if (mpi_enreg%paral_spinor==0) then
1570    shift=npw_k
1571    nspinor1TreatedByThisProc=.true.
1572    nspinor2TreatedByThisProc=(nspinortot==2)
1573  else
1574    shift=0
1575    nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0)
1576    nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1)
1577  end if
1578 
1579  ABI_MALLOC(work,(2,n4,n5,n6*ndat))
1580 
1581  if (nspinortot==1) then
1582 
1583    ABI_MALLOC(ghc1,(2,npw_k*ndat))
1584 
1585 !  Do it in 3 STEPs:
1586 !  STEP1: Compute grad of cwavef and Laplacian of cwavef
1587    ABI_MALLOC(gcwavef,(2,npw_k*ndat,3))
1588    ABI_MALLOC(lcwavef,(2,npw_k*ndat))
1589 !!$OMP PARALLEL DO
1590    do idat=1,ndat
1591      do ipw=1,npw_k
1592        gcwavef(:,ipw+(idat-1)*npw_k,1:3)=zero
1593        lcwavef(:,ipw+(idat-1)*npw_k)  =zero
1594      end do
1595    end do
1596    do idir=1,3
1597      gp2pi1=gprimd(idir,1)*two_pi
1598      gp2pi2=gprimd(idir,2)*two_pi
1599      gp2pi3=gprimd(idir,3)*two_pi
1600      kpt_cart=gp2pi1*kpt(1)+gp2pi2*kpt(2)+gp2pi3*kpt(3)
1601 !    Multiplication by 2pi i (G+k)_idir for gradient
1602 !    Multiplication by -(2pi (G+k)_idir )**2 for Laplacian
1603      do idat=1,ndat
1604        do ipw=1,npw_k
1605          kg_k_cart=gp2pi1*kg_k(1,ipw)+gp2pi2*kg_k(2,ipw)+gp2pi3*kg_k(3,ipw)+kpt_cart
1606          gcwavef(1,ipw+(idat-1)*npw_k,idir)= cwavef(2,ipw+(idat-1)*npw_k)*kg_k_cart
1607          gcwavef(2,ipw+(idat-1)*npw_k,idir)=-cwavef(1,ipw+(idat-1)*npw_k)*kg_k_cart
1608          lcwavef(1,ipw+(idat-1)*npw_k)=lcwavef(1,ipw+(idat-1)*npw_k)-cwavef(1,ipw+(idat-1)*npw_k)*kg_k_cart**2
1609          lcwavef(2,ipw+(idat-1)*npw_k)=lcwavef(2,ipw+(idat-1)*npw_k)-cwavef(2,ipw+(idat-1)*npw_k)*kg_k_cart**2
1610        end do
1611      end do
1612    end do ! idir
1613 !  STEP2: Compute (vxctaulocal)*(Laplacian of cwavef) and add it to ghc
1614    call fourwf(1,vxctaulocal(:,:,:,:,1),lcwavef,ghc1,work,gbound_k,gbound_k,&
1615 &   istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1616 &   tim_fourwf,weight,weight,gpu_option=gpu_option)
1617 !!$OMP PARALLEL DO
1618    do idat=1,ndat
1619      do ipw=1,npw_k
1620        ghc_mGGA(:,ipw+(idat-1)*npw_k)=ghc_mGGA(:,ipw+(idat-1)*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k)
1621      end do
1622    end do
1623    ABI_FREE(lcwavef)
1624 !  STEP3: Compute sum of (grad components of vxctaulocal)*(grad components of cwavef)
1625    do idir=1,3
1626      call fourwf(1,vxctaulocal(:,:,:,:,1+idir),gcwavef(:,:,idir),ghc1,work,gbound_k,gbound_k,&
1627      istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1628 &     tim_fourwf,weight,weight,gpu_option=gpu_option)
1629 !!$OMP PARALLEL DO
1630      do idat=1,ndat
1631        do ipw=1,npw_k
1632          ghc_mGGA(:,ipw+(idat-1)*npw_k)=ghc_mGGA(:,ipw+(idat-1)*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k)
1633        end do
1634      end do
1635    end do ! idir
1636    ABI_FREE(gcwavef)
1637    ABI_FREE(ghc1)
1638 
1639  else ! nspinortot==2
1640 
1641    ABI_MALLOC(cwavef1,(2,npw_k*ndat))
1642    ABI_MALLOC(cwavef2,(2,npw_k*ndat))
1643    do idat=1,ndat
1644      do ipw=1,npw_k
1645        cwavef1(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k)
1646        cwavef2(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k+shift)
1647      end do
1648    end do
1649 !  call cg_zcopy(npw*ndat,cwavef(1,1),cwavef1)
1650 !  call cg_zcopy(npw*ndat,cwavef(1,1+shift),cwavef2)
1651 
1652 
1653    if (nspinor1TreatedByThisProc) then
1654 
1655      ABI_MALLOC(ghc1,(2,npw_k*ndat))
1656 
1657 !    Do it in 3 STEPs:
1658 !    STEP1: Compute grad of cwavef and Laplacian of cwavef
1659      ABI_MALLOC(gcwavef1,(2,npw_k*ndat,3))
1660      ABI_MALLOC(lcwavef1,(2,npw_k*ndat))
1661 !!$OMP PARALLEL DO
1662      do idat=1,ndat
1663        do ipw=1,npw_k
1664          gcwavef1(:,ipw+(idat-1)*npw_k,1:3)=zero
1665          lcwavef1(:,ipw+(idat-1)*npw_k)=zero
1666        end do
1667      end do
1668      do idir=1,3
1669        gp2pi1=gprimd(idir,1)*two_pi
1670        gp2pi2=gprimd(idir,2)*two_pi
1671        gp2pi3=gprimd(idir,3)*two_pi
1672        kpt_cart=gp2pi1*kpt(1)+gp2pi2*kpt(2)+gp2pi3*kpt(3)
1673 !      Multiplication by 2pi i (G+k)_idir for gradient
1674 !      Multiplication by -(2pi (G+k)_idir )**2 for Laplacian
1675        do idat=1,ndat
1676          do ipw=1,npw_k
1677            kg_k_cart=gp2pi1*kg_k(1,ipw)+gp2pi2*kg_k(2,ipw)+gp2pi3*kg_k(3,ipw)+kpt_cart
1678            gcwavef1(1,ipw+(idat-1)*npw_k,idir)= cwavef1(2,ipw+(idat-1)*npw_k)*kg_k_cart
1679            gcwavef1(2,ipw+(idat-1)*npw_k,idir)=-cwavef1(1,ipw+(idat-1)*npw_k)*kg_k_cart
1680            lcwavef1(1,ipw+(idat-1)*npw_k)=lcwavef1(1,ipw+(idat-1)*npw_k)-cwavef1(1,ipw+(idat-1)*npw_k)*kg_k_cart**2
1681            lcwavef1(2,ipw+(idat-1)*npw_k)=lcwavef1(2,ipw+(idat-1)*npw_k)-cwavef1(2,ipw+(idat-1)*npw_k)*kg_k_cart**2
1682          end do
1683        end do
1684      end do ! idir
1685 !    STEP2: Compute (vxctaulocal)*(Laplacian of cwavef) and add it to ghc
1686      call fourwf(1,vxctaulocal(:,:,:,:,1),lcwavef1,ghc1,work,gbound_k,gbound_k,&
1687 &     istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1688 &     tim_fourwf,weight,weight,gpu_option=gpu_option)
1689 !!$OMP PARALLEL DO
1690      do idat=1,ndat
1691        do ipw=1,npw_k
1692          ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k)
1693        end do
1694      end do
1695      ABI_FREE(lcwavef1)
1696 !    STEP3: Compute (grad components of vxctaulocal)*(grad components of cwavef)
1697      do idir=1,3
1698        call fourwf(1,vxctaulocal(:,:,:,:,1+idir),gcwavef1(:,:,idir),ghc1,work,gbound_k,gbound_k,&
1699        istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1700 &      tim_fourwf,weight,weight,gpu_option=gpu_option)
1701 !!$OMP PARALLEL DO
1702        do idat=1,ndat
1703          do ipw=1,npw_k
1704            ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k) = ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k)
1705          end do
1706        end do
1707      end do ! idir
1708      ABI_FREE(gcwavef1)
1709      ABI_FREE(ghc1)
1710 
1711    end if ! spin 1 treated by this proc
1712 
1713    if (nspinor2TreatedByThisProc) then
1714 
1715      ABI_MALLOC(ghc2,(2,npw_k*ndat))
1716 
1717 !    Do it in 3 STEPs:
1718 !    STEP1: Compute grad of cwavef and Laplacian of cwavef
1719      ABI_MALLOC(gcwavef2,(2,npw_k*ndat,3))
1720      ABI_MALLOC(lcwavef2,(2,npw_k*ndat))
1721 !!$OMP PARALLEL DO
1722      do idat=1,ndat
1723        do ipw=1,npw_k
1724          gcwavef2(:,ipw+(idat-1)*npw_k,1:3)=zero
1725          lcwavef2(:,ipw+(idat-1)*npw_k)  =zero
1726        end do
1727      end do
1728      do idir=1,3
1729        gp2pi1=gprimd(idir,1)*two_pi
1730        gp2pi2=gprimd(idir,2)*two_pi
1731        gp2pi3=gprimd(idir,3)*two_pi
1732        kpt_cart=gp2pi1*kpt(1)+gp2pi2*kpt(2)+gp2pi3*kpt(3)
1733 !      Multiplication by 2pi i (G+k)_idir for gradient
1734 !      Multiplication by -(2pi (G+k)_idir )**2 for Laplacian
1735        do idat=1,ndat
1736          do ipw=1,npw_k
1737            kg_k_cart=gp2pi1*kg_k(1,ipw)+gp2pi2*kg_k(2,ipw)+gp2pi3*kg_k(3,ipw)+kpt_cart
1738            gcwavef2(1,ipw+(idat-1)*npw_k,idir)= cwavef2(2,ipw+(idat-1)*npw_k)*kg_k_cart
1739            gcwavef2(2,ipw+(idat-1)*npw_k,idir)=-cwavef2(1,ipw+(idat-1)*npw_k)*kg_k_cart
1740            lcwavef2(1,ipw+(idat-1)*npw_k)=lcwavef2(1,ipw+(idat-1)*npw_k)-cwavef2(1,ipw+(idat-1)*npw_k)*kg_k_cart**2
1741            lcwavef2(2,ipw+(idat-1)*npw_k)=lcwavef2(2,ipw+(idat-1)*npw_k)-cwavef2(2,ipw+(idat-1)*npw_k)*kg_k_cart**2
1742          end do
1743        end do
1744      end do ! idir
1745 !    STEP2: Compute (vxctaulocal)*(Laplacian of cwavef) and add it to ghc
1746      call fourwf(1,vxctaulocal(:,:,:,:,1),lcwavef2,ghc2,work,gbound_k,gbound_k,&
1747 &     istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1748 &     tim_fourwf,weight,weight,gpu_option=gpu_option)
1749 !!$OMP PARALLEL DO
1750      do idat=1,ndat
1751         do ipw=1,npw_k
1752            ! original code
1753            ! ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc2(:,ipw+(idat-1)*npw_k)
1754            ! but this stores the spinor2 result in the spinor1 location. Should be stored with shift
1755            ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)&
1756                 & -half*ghc2(:,ipw+(idat-1)*npw_k)
1757        end do
1758      end do
1759      ABI_FREE(lcwavef2)
1760 !    STEP3: Compute sum of (grad components of vxctaulocal)*(grad components of cwavef)
1761      do idir=1,3
1762        call fourwf(1,vxctaulocal(:,:,:,:,1+idir),gcwavef2(:,:,idir),ghc2,work,gbound_k,gbound_k,&
1763        istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1764 &      tim_fourwf,weight,weight,gpu_option=gpu_option)
1765 !!$OMP PARALLEL DO
1766        do idat=1,ndat
1767          do ipw=1,npw_k
1768            ! original code
1769            ! ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc2(:,ipw+(idat-1)*npw_k)
1770            ! but this stores the spinor2 result in the spinor1 location. Should be stored with shift
1771             ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)&
1772                  & -half*ghc2(:,ipw+(idat-1)*npw_k)
1773          end do
1774        end do
1775      end do ! idir
1776 
1777      ABI_FREE(gcwavef2)
1778      ABI_FREE(ghc2)
1779 
1780    end if ! spin 2 treated by this proc
1781 
1782    ABI_FREE(cwavef1)
1783    ABI_FREE(cwavef2)
1784 
1785  end if ! nspinortot
1786 
1787  ABI_FREE(work)
1788 
1789 end subroutine getghc_mGGA

ABINIT/getghc_nucdip [ Functions ]

[ Top ] [ Functions ]

NAME

 getghc_nucdip

FUNCTION

 Compute magnetic nuclear dipole moment contribution to <G|H|C>
 for input vector |C> expressed in reciprocal space.

INPUTS

 cwavef(2,npw_k*my_nspinor*ndat)=planewave coefficients of wavefunction.
 gbound_k(2*mgfft+4)=sphere boundary info
 gprimd(3,3)=dimensional reciprocal space primitive translations (b^-1)
 istwf_k=input parameter that describes the storage of wfs
 kg_k(3,npw_k)=G vec coordinates wrt recip lattice transl.
 kpt(3)=current k point
 mgfft=maximum single fft dimension
 mpi_enreg=information about MPI parallelization
 my_nspinor=number of spinorial components of the wavefunctions (on current proc)
 ndat=number of FFTs to perform in parall
 ngfft(18)=contain all needed information about 3D FFT
 npw_k=number of planewaves in basis for given k point.
 nvloc=number of spin components of vxctaulocal
 n4,n5,n6=for dimensionning of vxctaulocal
 gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
 vectornd(n4,n5,n6,nvloc,3)= local potential corresponding to the vector potential of the array
  of nuclear magnetic dipoles, in real space, on the augmented fft grid.

OUTPUT

  ghc_vectornd(2,npw_k*my_nspinor*ndat)=A.p contribution to <G|H|C> for array of nuclear dipoles

SIDE EFFECTS

NOTES

 this code is a copied, simplied version of getghc_mGGA (see below) and should eventually be
 integrated into that code, to simplify maintenance

SOURCE

1300 subroutine getghc_nucdip(cwavef,ghc_vectornd,gbound_k,istwf_k,kg_k,kpt,mgfft,mpi_enreg,&
1301 &                      ndat,ngfft,npw_k,nvloc,n4,n5,n6,my_nspinor,vectornd,gpu_option)
1302 
1303 !Arguments ------------------------------------
1304 !scalars
1305  integer,intent(in) :: istwf_k,mgfft,my_nspinor,ndat,npw_k,nvloc,n4,n5,n6,gpu_option
1306  type(MPI_type),intent(in) :: mpi_enreg
1307 !arrays
1308  integer,intent(in) :: gbound_k(2*mgfft+4),kg_k(3,npw_k),ngfft(18)
1309  real(dp),intent(in) :: kpt(3)
1310  real(dp),intent(inout) :: cwavef(2,npw_k*my_nspinor*ndat)
1311  real(dp),intent(inout) :: ghc_vectornd(2,npw_k*my_nspinor*ndat)
1312  real(dp),intent(inout) :: vectornd(n4,n5,n6,nvloc,3)
1313 
1314 !Local variables-------------------------------
1315 !scalars
1316  integer,parameter :: tim_fourwf=1
1317  integer :: idat,idir,ipw,nspinortot,shift
1318  logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
1319  real(dp) :: scale_conversion,weight=one
1320  !arrays
1321  real(dp),allocatable :: cwavef1(:,:),cwavef2(:,:)
1322  real(dp),allocatable :: gcwavef(:,:,:),gcwavef1(:,:,:),gcwavef2(:,:,:)
1323  real(dp),allocatable :: ghc1(:,:),ghc2(:,:),kgkpk(:,:)
1324  real(dp),allocatable :: work(:,:,:,:)
1325 
1326 ! *********************************************************************
1327 
1328  ghc_vectornd(:,:)=zero
1329  if (nvloc/=1) return
1330 
1331  nspinortot=min(2,(1+mpi_enreg%paral_spinor)*my_nspinor)
1332  if (mpi_enreg%paral_spinor==0) then
1333    shift=npw_k
1334    nspinor1TreatedByThisProc=.true.
1335    nspinor2TreatedByThisProc=(nspinortot==2)
1336  else
1337    shift=0
1338    nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0)
1339    nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1)
1340  end if
1341 
1342  ABI_MALLOC(work,(2,n4,n5,n6*ndat))
1343 
1344  ! scale conversion from SI to atomic units,
1345  ! here \alpha^2 where \alpha is the fine structure constant
1346  scale_conversion = FineStructureConstant2
1347 
1348  if (nspinortot==1) then
1349 
1350     ABI_MALLOC(ghc1,(2,npw_k*ndat))
1351 
1352     !  Do it in 2 STEPs:
1353     !  STEP1: Compute grad of cwavef
1354     ABI_MALLOC(gcwavef,(2,npw_k*ndat,3))
1355 
1356     gcwavef = zero
1357 
1358     ! compute k + G. Note these are in reduced coords
1359     ABI_MALLOC(kgkpk,(npw_k,3))
1360     do ipw = 1, npw_k
1361        kgkpk(ipw,:) = kpt(:) + kg_k(:,ipw)
1362     end do
1363 
1364     ! make 2\pi(k+G)c(G)|G> by element-wise multiplication
1365     do idir = 1, 3
1366        do idat = 1, ndat
1367           gcwavef(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = &
1368                & cwavef(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir)
1369           gcwavef(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = &
1370                & cwavef(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir)
1371        end do
1372     end do
1373     ABI_FREE(kgkpk)
1374     gcwavef = gcwavef*two_pi
1375 
1376     !  STEP2: Compute sum of (grad components of vectornd)*(grad components of cwavef)
1377     do idir=1,3
1378       call fourwf(1,vectornd(:,:,:,:,idir),gcwavef(:,:,idir),ghc1,work,gbound_k,gbound_k,&
1379            istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1380            &     tim_fourwf,weight,weight,gpu_option=gpu_option)
1381 !!$OMP PARALLEL DO
1382        ! DAXPY is a BLAS routine for y -> A*x + y, here x = ghc1, A = scale_conversion, and y = ghc_vectornd
1383        ! should be faster than explicit loop over ipw as npw_k gets large
1384       do idat=1,ndat
1385         call DAXPY(npw_k,scale_conversion,ghc1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,&
1386              & ghc_vectornd(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1)
1387         call DAXPY(npw_k,scale_conversion,ghc1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,&
1388              & ghc_vectornd(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1)
1389       end do
1390     end do ! idir
1391     ABI_FREE(gcwavef)
1392     ABI_FREE(ghc1)
1393 
1394  else ! nspinortot==2
1395 
1396     ABI_MALLOC(cwavef1,(2,npw_k*ndat))
1397     ABI_MALLOC(cwavef2,(2,npw_k*ndat))
1398     do idat=1,ndat
1399        do ipw=1,npw_k
1400           cwavef1(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k)
1401           cwavef2(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k+shift)
1402        end do
1403     end do
1404 
1405     ! compute k + G. Note these are in reduced coords
1406     ABI_MALLOC(kgkpk,(npw_k,3))
1407     do ipw = 1, npw_k
1408        kgkpk(ipw,:) = kpt(:) + kg_k(:,ipw)
1409     end do
1410 
1411     if (nspinor1TreatedByThisProc) then
1412 
1413        ABI_MALLOC(ghc1,(2,npw_k*ndat))
1414 
1415        !  Do it in 2 STEPs:
1416        !  STEP1: Compute grad of cwavef
1417        ABI_MALLOC(gcwavef1,(2,npw_k*ndat,3))
1418 
1419        gcwavef1 = zero
1420        ! make 2\pi(k+G)c(G)|G> by element-wise multiplication
1421        do idir = 1, 3
1422           do idat = 1, ndat
1423              gcwavef1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = &
1424                   & cwavef1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir)
1425              gcwavef1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = &
1426                   & cwavef1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir)
1427           end do
1428        end do
1429        gcwavef1 = gcwavef1*two_pi
1430 
1431        !  STEP2: Compute sum of (grad components of vectornd)*(grad components of cwavef)
1432        do idir=1,3
1433           call fourwf(1,vectornd(:,:,:,:,idir),gcwavef1(:,:,idir),ghc1,work,gbound_k,gbound_k,&
1434                istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1435                &     tim_fourwf,weight,weight,gpu_option=gpu_option)
1436 !!$OMP PARALLEL DO
1437           ! DAXPY is a BLAS routine for y -> A*x + y, here x = ghc1, A = scale_conversion, and y = ghc_vectornd
1438           ! should be faster than explicit loop over ipw as npw_k gets large
1439           do idat=1,ndat
1440              call DAXPY(npw_k,scale_conversion,ghc1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,&
1441                   & ghc_vectornd(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1)
1442              call DAXPY(npw_k,scale_conversion,ghc1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,&
1443                   & ghc_vectornd(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1)
1444           end do
1445        end do ! idir
1446        ABI_FREE(gcwavef1)
1447        ABI_FREE(ghc1)
1448 
1449     end if ! end spinor 1
1450 
1451     if (nspinor2TreatedByThisProc) then
1452 
1453        ABI_MALLOC(ghc2,(2,npw_k*ndat))
1454 
1455        !  Do it in 2 STEPs:
1456        !  STEP1: Compute grad of cwavef
1457        ABI_MALLOC(gcwavef2,(2,npw_k*ndat,3))
1458 
1459        gcwavef2 = zero
1460        ! make 2\pi(k+G)c(G)|G> by element-wise multiplication
1461        do idir = 1, 3
1462           do idat = 1, ndat
1463              gcwavef2(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = &
1464                   & cwavef2(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir)
1465              gcwavef2(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = &
1466                   & cwavef2(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir)
1467           end do
1468        end do
1469        gcwavef2 = gcwavef2*two_pi
1470 
1471        !  STEP2: Compute sum of (grad components of vectornd)*(grad components of cwavef)
1472        do idir=1,3
1473           call fourwf(1,vectornd(:,:,:,:,idir),gcwavef2(:,:,idir),ghc2,work,gbound_k,gbound_k,&
1474                istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,&
1475                &     tim_fourwf,weight,weight,gpu_option=gpu_option)
1476 !!$OMP PARALLEL DO
1477           ! DAXPY is a BLAS routine for y -> A*x + y, here x = ghc1, A = scale_conversion, and y = ghc_vectornd
1478           ! should be faster than explicit loop over ipw as npw_k gets large
1479           do idat=1,ndat
1480              call DAXPY(npw_k,scale_conversion,ghc2(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,&
1481                   & ghc_vectornd(1,1+(idat-1)*npw_k+shift:npw_k+(idat-1)*npw_k+shift),1)
1482              call DAXPY(npw_k,scale_conversion,ghc2(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,&
1483                   & ghc_vectornd(2,1+(idat-1)*npw_k+shift:npw_k+(idat-1)*npw_k+shift),1)
1484           end do
1485        end do ! idir
1486        ABI_FREE(gcwavef2)
1487        ABI_FREE(ghc2)
1488 
1489     end if ! end spinor 2
1490 
1491     ABI_FREE(cwavef1)
1492     ABI_FREE(cwavef2)
1493     ABI_FREE(kgkpk)
1494 
1495  end if ! nspinortot
1496 
1497  ABI_FREE(work)
1498 
1499 end subroutine getghc_nucdip

ABINIT/getgsc [ Functions ]

[ Top ] [ Functions ]

NAME

 getgsc

FUNCTION

 Compute <G|S|C> for all input vectors |Cnk> at a given k-point,
              OR for one input vector |Cnk>.
 |Cnk> are expressed in reciprocal space.
 S is the overlap operator between |Cnk> (used for PAW).

INPUTS

  cg(2,mcg)=planewave coefficients of wavefunctions
  cprj(natom,mcprj)= wave functions projected with non-local projectors: cprj=<p_i|Cnk>
  gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k+q
  ibg=shift to be applied on the location of data in the array cprj (beginning of current k-point)
  icg=shift to be applied on the location of data in the array cg (beginning of current k-point)
  igsc=shift to be applied on the location of data in the array gsc (beginning of current k-point)
  ikpt,isppol=indexes of current (spin.kpoint)
  mcg=second dimension of the cg array
  mcprj=second dimension of the cprj array
  mgsc=second dimension of the gsc array
  mpi_enreg=information about MPI parallelization
  natom=number of atoms in unit cell.
  nband= if positive: number of bands at this k point for that spin polarization
         if negative: abs(nband) is the index of the only band to be computed
  npw_k=number of planewaves in basis for given k point.
  nspinor=number of spinorial components of the wavefunctions
 [select_k]=optional, option governing the choice of k points to be used.
             gs_ham datastructure contains quantities needed to apply overlap operator
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|S|k>       is applied [default]
             if select_k=2, <k|S|k^prime>       is applied
             if select_k=3, <k|S|k>             is applied
             if select_k=4, <k^prime|S|k^prime> is applied

OUTPUT

  gsc(2,mgsc)= <g|S|Cnk> or <g|S^(1)|Cnk> (S=overlap)

SOURCE

1832 subroutine getgsc(cg,cprj,gs_ham,gsc,ibg,icg,igsc,ikpt,isppol,&
1833 &                 mcg,mcprj,mgsc,mpi_enreg,natom,nband,npw_k,nspinor,select_k)
1834 
1835 !Arguments ------------------------------------
1836 !scalars
1837  integer,intent(in) :: ibg,icg,igsc,ikpt,isppol,mcg,mcprj
1838  integer,intent(in) :: mgsc,natom,nband,npw_k,nspinor
1839 !TODO : may be needed to distribute cprj over band procs
1840 ! integer,intent(in) :: mband_mem
1841  integer,intent(in),optional :: select_k
1842  type(MPI_type),intent(in) :: mpi_enreg
1843  type(gs_hamiltonian_type),intent(inout),target :: gs_ham
1844 !arrays
1845  real(dp),intent(in) :: cg(2,mcg)
1846  real(dp),intent(out) :: gsc(2,mgsc)
1847  type(pawcprj_type),intent(in) :: cprj(natom,mcprj)
1848 
1849 !Local variables-------------------------------
1850 !scalars
1851  integer :: choice,cpopt,dimenl1,dimenl2,iband,iband1,iband2,index_cg,index_cprj
1852  integer :: index_gsc,me,my_nspinor,paw_opt,select_k_,signs,tim_nonlop,useylm
1853  !character(len=500) :: msg
1854 !arrays
1855  real(dp) :: enlout_dum(1),tsec(2)
1856  real(dp),allocatable :: cwavef(:,:),scwavef(:,:)
1857  type(pawcprj_type),allocatable :: cwaveprj(:,:)
1858 
1859 ! *********************************************************************
1860 
1861  DBG_ENTER("COLL")
1862 
1863 !Compatibility tests
1864  my_nspinor=max(1,nspinor/mpi_enreg%nproc_spinor)
1865  if(gs_ham%usepaw==0) then
1866    ABI_BUG('Only compatible with PAW (usepaw=1) !')
1867  end if
1868  if(nband<0.and.(mcg<npw_k*my_nspinor.or.mgsc<npw_k*my_nspinor.or.mcprj<my_nspinor)) then
1869    ABI_BUG('Invalid value for mcg, mgsc or mcprj !')
1870  end if
1871 
1872 !Keep track of total time spent in getgsc:
1873  call timab(565,1,tsec)
1874 
1875  gsc = zero
1876 
1877 !Prepare some data
1878  ABI_MALLOC(cwavef,(2,npw_k*my_nspinor))
1879  ABI_MALLOC(scwavef,(2,npw_k*my_nspinor))
1880  if (gs_ham%usecprj==1) then
1881    ABI_MALLOC(cwaveprj,(natom,my_nspinor))
1882    call pawcprj_alloc(cwaveprj,0,gs_ham%dimcprj)
1883  else
1884    ABI_MALLOC(cwaveprj,(0,0))
1885  end if
1886  dimenl1=gs_ham%dimekb1;dimenl2=natom;tim_nonlop=0
1887  choice=1;signs=2;cpopt=-1+3*gs_ham%usecprj;paw_opt=3;useylm=1
1888  select_k_=1;if (present(select_k)) select_k_=select_k
1889  me=mpi_enreg%me_kpt
1890 
1891 !Loop over bands
1892  index_cprj=ibg;index_cg=icg;index_gsc=igsc
1893  if (nband>0) then
1894    iband1=1;iband2=nband
1895 !only do 1 band in case nband < 0 (the |nband|th one)
1896  else if (nband<0) then
1897    iband1=abs(nband);iband2=iband1
1898    index_cprj=index_cprj+(iband1-1)*my_nspinor
1899    index_cg  =index_cg  +(iband1-1)*npw_k*my_nspinor
1900    index_gsc =index_gsc +(iband1-1)*npw_k*my_nspinor
1901  end if
1902 
1903  do iband=iband1,iband2
1904 
1905    if (mpi_enreg%proc_distrb(ikpt,iband,isppol)/=me.and.nband>0) then
1906 ! No longer needed 28/03/2020 to parallelize memory
1907 !     gsc(:,1+index_gsc:npw_k*my_nspinor+index_gsc)=zero
1908 !     index_gsc=index_gsc+npw_k*my_nspinor
1909      !index_cprj=index_cprj+my_nspinor
1910      !index_cg=index_cg+npw_k*my_nspinor
1911 
1912      cycle
1913    end if
1914 
1915 !  Retrieve WF at (n,k)
1916    cwavef(:,1:npw_k*my_nspinor)=cg(:,1+index_cg:npw_k*my_nspinor+index_cg)
1917    if (gs_ham%usecprj==1) then
1918      call pawcprj_copy(cprj(:,1+index_cprj:my_nspinor+index_cprj),cwaveprj)
1919    end if
1920 
1921 !  Compute <g|S|Cnk>
1922    call nonlop(choice,cpopt,cwaveprj,enlout_dum,gs_ham,0,(/zero/),mpi_enreg,1,1,paw_opt,&
1923 &   signs,scwavef,tim_nonlop,cwavef,cwavef,select_k=select_k_)
1924 
1925    gsc(:,1+index_gsc:npw_k*my_nspinor+index_gsc)=scwavef(:,1:npw_k*my_nspinor)
1926 
1927 !  End of loop over bands
1928    index_cprj=index_cprj+my_nspinor
1929    index_cg=index_cg+npw_k*my_nspinor
1930    index_gsc=index_gsc+npw_k*my_nspinor
1931  end do
1932 
1933 !Memory deallocation
1934  ABI_FREE(cwavef)
1935  ABI_FREE(scwavef)
1936  if (gs_ham%usecprj==1) then
1937    call pawcprj_free(cwaveprj)
1938  end if
1939  ABI_FREE(cwaveprj)
1940 
1941  call timab(565,2,tsec)
1942 
1943  DBG_EXIT("COLL")
1944 
1945 end subroutine getgsc

ABINIT/m_getghc [ Modules ]

[ Top ] [ Modules ]

NAME

  m_getghc

FUNCTION

 Compute <G|H|C> for input vector |C> expressed in reciprocal space;

COPYRIGHT

  Copyright (C) 1998-2024 ABINIT group (DCA, XG, GMR, LSI, MT, JB, JWZ)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 ! nvtx related macro definition
23 #include "nvtx_macros.h"
24 
25 module m_getghc
26 
27  use, intrinsic :: iso_c_binding
28 
29  use defs_basis
30  use m_errors
31  use m_abicore
32  use m_xmpi
33 
34  use defs_abitypes, only : mpi_type
35  use m_time,        only : timab
36  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_getdim, pawcprj_copy
37  use m_bandfft_kpt, only : bandfft_kpt, bandfft_kpt_get_ikpt
38  use m_hamiltonian, only : gs_hamiltonian_type, KPRIME_H_K, K_H_KPRIME, K_H_K, KPRIME_H_KPRIME
39  use m_fock,        only : fock_common_type, fock_get_getghc_call
40  use m_fock_getghc, only : fock_getghc, fock_ACE_getghc
41  use m_nonlop,      only : nonlop
42  use m_gemm_nonlop, only : gemm_nonlop_use_gemm
43  use m_fft,         only : fourwf
44  use m_getghc_ompgpu,  only : getghc_ompgpu
45 
46 #ifdef HAVE_FFTW3_THREADS
47  use m_fftw3,       only : fftw3_spawn_threads_here, fftw3_use_lib_threads 
48 #endif
49 
50 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
51  use m_nvtx_data
52 #endif
53 
54 #if defined HAVE_GPU_CUDA
55  use m_gpu_toolbox
56 #endif
57 
58 #if defined HAVE_YAKL
59  use gator_mod
60 #endif
61 
62 #ifdef HAVE_KOKKOS
63  use m_manage_kokkos, only : assemble_energy_contribution_kokkos
64 #endif
65 
66  implicit none
67 
68  private

ABINIT/multithreaded_getghc [ Functions ]

[ Top ] [ Functions ]

NAME

 multithreaded_getghc

FUNCTION

INPUTS

 cpopt=flag defining the status of cwaveprj%cp(:)=<Proj_i|Cnk> scalars (PAW only)
       (same meaning as in nonlop.F90 routine)
       if cpopt=-1, <p_lmn|in> (and derivatives) are computed here (and not saved)
       if cpopt= 0, <p_lmn|in> are computed here and saved
       if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
       if cpopt= 2  <p_lmn|in> are already in memory;
       if cpopt= 3  <p_lmn|in> are already in memory; first derivatives are computed here and saved
       if cpopt= 4  <p_lmn|in> and first derivatives are already in memory;
 cwavef(2,npw*my_nspinor*ndat)=planewave coefficients of wavefunction.
 gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian to be applied
 lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1
        Typically lambda is the eigenvalue (or its guess)
 mpi_enreg=information about MPI parallelization
 ndat=number of FFT to do in parallel
 prtvol=control print volume and debugging output
 sij_opt= -PAW ONLY-  if  0, only matrix elements <G|H|C> have to be computed
    (S=overlap)       if  1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc
                      if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used)
 tim_getghc=timing code of the calling subroutine(can be set to 0 if not attributed)
 type_calc= option governing which part of Hamitonian is to be applied:
            0: whole Hamiltonian
            1: local part only
            2: non-local+kinetic only (added to the existing Hamiltonian)
            3: local + kinetic only (added to the existing Hamiltonian)
 ===== Optional inputs =====
   [kg_fft_k(3,:)]=optional, (k+G) vector coordinates to be used for the FFT tranformation
                   instead of the one contained in gs_ham datastructure.
                   Typically used for real WF (in parallel) which are FFT-transformed 2 by 2.
   [kg_fft_kp(3,:)]=optional, (k^prime+G) vector coordinates to be used for the FFT tranformation
   [select_k]=optional, option governing the choice of k points to be used.
             gs_ham datastructure contains quantities needed to apply Hamiltonian
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|H|k>       is applied [default]
             if select_k=2, <k|H|k^prime>       is applied
             if select_k=3, <k|H|k>             is applied
             if select_k=4, <k^prime|H|k^prime> is applied

OUTPUT

  ghc(2,npw*my_nspinor*ndat)=matrix elements <G|H|C> (if sij_opt>=0)
                                          or <G|H-lambda.S|C> (if sij_opt=-1)
  gvnlxc(2,npw*my_nspinor*ndat)=matrix elements <G|Vnonlocal|C> (if sij_opt>=0)
                                            or <G|Vnonlocal-lambda.S|C> (if sij_opt=-1)
  if (sij_opt=1)
    gsc(2,npw*my_nspinor*ndat)=matrix elements <G|S|C> (S=overlap).

SIDE EFFECTS

  cwaveprj(natom,my_nspinor*(1+cpopt)*ndat)= wave function projected on nl projectors (PAW only)

SOURCE

2006 subroutine multithreaded_getghc(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,&
2007 &                 prtvol,sij_opt,tim_getghc,type_calc,&
2008 &                 kg_fft_k,kg_fft_kp,select_k) ! optional arguments
2009 
2010 #ifdef HAVE_OPENMP
2011    use omp_lib
2012 #endif
2013 
2014 !Arguments ------------------------------------
2015 !scalars
2016  integer,intent(in) :: cpopt,ndat, prtvol
2017  integer,intent(in) :: sij_opt,tim_getghc,type_calc
2018  integer,intent(in),optional :: select_k
2019  real(dp),intent(in) :: lambda
2020  type(MPI_type),intent(in) :: mpi_enreg
2021  type(gs_hamiltonian_type),intent(inout),target :: gs_ham
2022 !arrays
2023  integer,intent(in),optional,target :: kg_fft_k(:,:),kg_fft_kp(:,:)
2024  real(dp),intent(out),target :: gsc(:,:)
2025  real(dp),intent(inout) :: cwavef(:,:)
2026  real(dp),intent(out) :: ghc(:,:),gvnlxc(:,:)
2027  type(pawcprj_type),intent(inout),target :: cwaveprj(:,:)
2028 
2029 !Local variables-------------------------------
2030 !scalars
2031  integer :: firstelt, firstprj, lastelt, lastprj,usegvnlxc
2032  integer :: nthreads
2033  integer :: ithread
2034  integer :: chunk
2035  integer :: residuchunk
2036  integer :: firstband
2037  integer :: lastband
2038  integer :: spacedim, spacedim_prj
2039 #ifdef HAVE_OPENMP
2040  logical :: is_nested,fftw3_use_lib_threads_sav
2041 #endif
2042  integer :: select_k_default
2043 
2044  ! *************************************************************************
2045 
2046  select_k_default = 1; if ( present(select_k) ) select_k_default = select_k
2047 
2048  spacedim     = size(cwavef  ,dim=2)/ndat
2049  spacedim_prj = size(cwaveprj,dim=2)/ndat
2050 
2051 
2052  ! Disabling multithreading for GPU variants (getghc_ompgpu is not thread-safe for now)
2053  !$omp parallel default (none) &
2054  !$omp& private(ithread,nthreads,chunk,firstband,lastband,residuchunk,firstelt,lastelt), &
2055  !$omp& private(firstprj,lastprj,is_nested,usegvnlxc,fftw3_use_lib_threads_sav), &
2056  !$omp& shared(cwavef,ghc,gsc, gvnlxc,spacedim,spacedim_prj,ndat,kg_fft_k,kg_fft_kp,gs_ham,cwaveprj,mpi_enreg), &
2057  !$omp& shared(gemm_nonlop_use_gemm), &
2058  !$omp& firstprivate(cpopt,lambda,prtvol,sij_opt,tim_getghc,type_calc,select_k_default) &
2059  !$omp& IF(gs_ham%gpu_option==ABI_GPU_DISABLED .and. .not. gemm_nonlop_use_gemm)
2060  ithread = 0
2061  nthreads = 1
2062  if(gs_ham%gpu_option==ABI_GPU_DISABLED .and. .not. gemm_nonlop_use_gemm) then
2063 #ifdef HAVE_OPENMP
2064    ithread = omp_get_thread_num()
2065    nthreads = omp_get_num_threads()
2066 !   is_nested = omp_get_nested()
2067    is_nested = .false.
2068 !   call omp_set_nested(.false.)
2069 !Ensure that libs are used without threads (mkl, openblas, fftw3, ...)
2070 #ifdef HAVE_LINALG_MKL_THREADS
2071    call mkl_set_num_threads(1)
2072 #endif
2073 #ifdef HAVE_LINALG_OPENBLAS_THREADS
2074    call openblas_set_num_threads(1)
2075 #endif
2076 #ifdef HAVE_FFTW3_THREADS
2077    fftw3_use_lib_threads_sav=(.not.fftw3_spawn_threads_here(nthreads,nthreads))
2078    call fftw3_use_lib_threads(.false.)
2079 #endif
2080 #endif
2081  end if
2082  chunk = ndat/nthreads ! Divide by 2 to construct chunk of even number of bands
2083  residuchunk = ndat - nthreads*chunk
2084  if ( ithread < nthreads-residuchunk ) then
2085    firstband = ithread*chunk+1
2086    lastband = (ithread+1)*chunk
2087  else
2088    firstband = (nthreads-residuchunk)*chunk + ( ithread -(nthreads-residuchunk) )*(chunk+1) +1
2089    lastband = firstband+chunk
2090  end if
2091  usegvnlxc=1
2092  if (size(gvnlxc)<=1) usegvnlxc=0
2093 
2094  if ( lastband /= 0 ) then
2095    firstelt = (firstband-1)*spacedim+1
2096    firstprj = (firstband-1)*spacedim_prj+1
2097    lastelt = lastband*spacedim
2098    lastprj = lastband*spacedim_prj
2099       ! Don't know how to manage optional arguments .... :(
2100    if ( present(kg_fft_k) ) then
2101      if (present(kg_fft_kp)) then
2102        call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),&
2103 &      ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),&
2104 &      gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,&
2105 &      prtvol,sij_opt,tim_getghc,type_calc,&
2106 &      select_k=select_k_default,kg_fft_k=kg_fft_k,kg_fft_kp=kg_fft_kp)
2107      else
2108        call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),&
2109 &      ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),&
2110 &      gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,&
2111 &      prtvol,sij_opt,tim_getghc,type_calc,&
2112 &      select_k=select_k_default,kg_fft_k=kg_fft_k)
2113      end if
2114    else
2115      if (present(kg_fft_kp)) then
2116        call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),&
2117 &      ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),&
2118 &      gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,&
2119 &      prtvol,sij_opt,tim_getghc,type_calc,&
2120 &      select_k=select_k_default,kg_fft_kp=kg_fft_kp)
2121      else
2122        call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),&
2123 &      ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),&
2124 &      gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,&
2125 &      prtvol,sij_opt,tim_getghc,type_calc,&
2126 &      select_K=select_k_default)
2127      end if
2128    end if
2129  end if
2130  if(gs_ham%gpu_option==ABI_GPU_DISABLED .and. .not. gemm_nonlop_use_gemm) then
2131 #ifdef HAVE_OPENMP
2132   ! call omp_set_nested(is_nested)
2133   !Restore libs behavior (mkl, openblas, fftw3, ...)
2134 #ifdef HAVE_LINALG_MKL_THREADS
2135    call mkl_set_num_threads(nthreads)
2136 #endif
2137 #ifdef HAVE_LINALG_OPENBLAS_THREADS
2138    call openblas_set_num_threads(nthreads)
2139 #endif
2140 #ifdef HAVE_FFTW3_THREADS
2141    call fftw3_use_lib_threads(fftw3_use_lib_threads_sav)
2142 #endif
2143 #endif
2144  end if
2145     !$omp end parallel
2146 
2147 end subroutine multithreaded_getghc