TABLE OF CONTENTS


ABINIT/fxphas [ Functions ]

[ Top ] [ Functions ]

NAME

 fxphas

FUNCTION

 Fix phase of all bands. Keep normalization but maximize real part
 (minimize imag part). Also fix the sign of real part
 by setting the first non-zero element to be positive.
 See also fxphas_seq in m_cgtools

INPUTS

  cg(2,mcg)= contains the wavefunction |c> coefficients.
  gsc(2,mgsc)= if useoverlap==1, contains the S|c> coefficients,
               where S is an overlap matrix.
  icg=shift to be applied on the location of data in the array cg
  igsc=shift to be applied on the location of data in the array gsc
  istwfk=input option parameter that describes the storage of wfs
    (set to 1 if usual complex vectors)
  mcg=size of second dimension of cg
  mgsc=size of second dimension of gsc
  mpi_enreg=information about MPI parallelization
  nband_k=number of bands
  npw_k=number of planewaves
  useoverlap=describe the overlap of wavefunctions:
               0: no overlap (S=Identi0,ty_matrix)
               1: wavefunctions are overlapping

OUTPUT

  cg(2,mcg)=same array with altered phase.
  gsc(2,mgsc)= same array with altered phase.

NOTES

 When the sign of the real part was fixed (modif v3.1.3g.6), the
 test Tv3#5 , dataset 5, behaved differently than previously.
 This should be cleared up.

PARENTS

      vtowfk

CHILDREN

      timab,xmpi_sum

SOURCE

1015 subroutine fxphas(cg,gsc,icg,igsc,istwfk,mcg,mgsc,mpi_enreg,nband_k,npw_k,useoverlap)
1016 
1017 
1018 !This section has been created automatically by the script Abilint (TD).
1019 !Do not modify the following lines by hand.
1020 #undef ABI_FUNC
1021 #define ABI_FUNC 'fxphas'
1022 !End of the abilint section
1023 
1024  implicit none
1025 
1026 !Arguments ------------------------------------
1027 !scalars
1028  integer,intent(in) :: icg,igsc,istwfk,mcg,mgsc,nband_k,npw_k,useoverlap
1029  type(MPI_type),intent(in) :: mpi_enreg
1030 !arrays
1031  real(dp),intent(inout) :: cg(2,mcg),gsc(2,mgsc*useoverlap)
1032 
1033 !Local variables-------------------------------
1034 !scalars
1035  integer :: iband,ierr,ii,indx
1036  real(dp) :: cim,cre,gscim,gscre,quotient,root1,root2,saa,sab,sbb,theta
1037  real(dp) :: thppi,xx,yy
1038  character(len=500) :: message
1039 !arrays
1040  real(dp) :: buffer2(nband_k,2),buffer3(nband_k,3),tsec(2)
1041  real(dp),allocatable :: cimb(:),creb(:),saab(:),sabb(:),sbbb(:) !,sarr(:,:)
1042 
1043 ! *************************************************************************
1044 
1045 !The general case, where a complex phase indeterminacy is present
1046  if(istwfk==1)then
1047 
1048    ABI_ALLOCATE(cimb,(nband_k))
1049    ABI_ALLOCATE(creb,(nband_k))
1050    ABI_ALLOCATE(saab,(nband_k))
1051    ABI_ALLOCATE(sabb,(nband_k))
1052    ABI_ALLOCATE(sbbb,(nband_k))
1053    cimb(:)=zero ; creb(:)=zero
1054 
1055 !  Loop over bands
1056 !  TODO: MG store saa arrays in sarr(3,nband_k) to reduce false sharing.
1057 !$OMP PARALLEL DO DEFAULT(PRIVATE) SHARED(nband_k,icg,npw_k,cg,saab,sbbb,sabb)
1058    do iband=1,nband_k
1059      indx=icg+(iband-1)*npw_k
1060 
1061 !    Compute several sums over Re, Im parts of c
1062      saa=zero; sbb=zero; sab=zero
1063      do ii=1+indx,npw_k+indx
1064        saa=saa+cg(1,ii)*cg(1,ii)
1065        sbb=sbb+cg(2,ii)*cg(2,ii)
1066        sab=sab+cg(1,ii)*cg(2,ii)
1067      end do
1068      saab(iband)=saa
1069      sbbb(iband)=sbb
1070      sabb(iband)=sab
1071    end do
1072 
1073 !  XG030513 : MPIWF : should transmit saab,sbbb,sabb from the procs
1074 !  of the WF group to the master processor of the WF group
1075    if (mpi_enreg%paral_kgb == 1) then
1076      buffer3(:,1)=saab(:)
1077      buffer3(:,2)=sbbb(:)
1078      buffer3(:,3)=sabb(:)
1079      call timab(48,1,tsec)
1080      call xmpi_sum(buffer3,mpi_enreg%comm_fft,ierr)
1081      if (mpi_enreg%paral_spinor==1) then
1082        call xmpi_sum(buffer3,mpi_enreg%comm_spinor,ierr)
1083      end if
1084      call timab(48,2,tsec)
1085      saab(:)=buffer3(:,1)
1086      sbbb(:)=buffer3(:,2)
1087      sabb(:)=buffer3(:,3)
1088    end if
1089 
1090 !  XG030513 : MPIWF this loop should only be executed by the master of the WF group
1091 
1092    if (mpi_enreg%paral_kgb==0.or.mpi_enreg%me_fft==0) then
1093      do iband=1,nband_k
1094        indx=icg+(iband-1)*npw_k
1095 
1096        saa=saab(iband)
1097        sbb=sbbb(iband)
1098        sab=sabb(iband)
1099 
1100 !      Get phase angle theta
1101        if (sbb+saa>tol8)then
1102          if(abs(sbb-saa)>tol8*(sbb+saa) .or. 2*abs(sab)>tol8*(sbb+saa))then
1103            if (abs(sbb-saa)>tol8*abs(sab)) then
1104              quotient=sab/(sbb-saa)
1105              theta=0.5_dp*atan(2.0_dp*quotient)
1106            else
1107 !            Taylor expansion of the atan in terms of inverse of its argument. Correct up to 1/x2, included.
1108              theta=0.25_dp*(pi-(sbb-saa)/sab)
1109            end if
1110 !          Check roots to get theta for max Re part
1111            root1=cos(theta)**2*saa+sin(theta)**2*sbb-2.0_dp*cos(theta)*sin(theta)*sab
1112            thppi=theta+0.5_dp*pi
1113            root2=cos(thppi)**2*saa+sin(thppi)**2*sbb-2.0_dp*cos(thppi)*sin(thppi)*sab
1114            if (root2>root1) theta=thppi
1115          else
1116 !          The real part vector and the imaginary part vector are orthogonal, and of same norm. Strong indeterminacy.
1117 !          Will determine the first non-zero coefficient, and fix its phase
1118 !          Hypothesis : there is at least one non-zero element on the master node ...
1119            do ii=1+indx,npw_k+indx
1120              cre=cg(1,ii)
1121              cim=cg(2,ii)
1122              if(cre**2+cim**2>tol8**2*(saa+sbb))then
1123                if(cre**2>tol8**2**cim**2)then
1124                  theta=atan(cim/cre)
1125                else
1126 !                Taylor expansion of the atan in terms of inverse of its argument. Correct up to 1/x2, included.
1127                  theta=pi/2-cre/cim
1128                end if
1129                exit
1130              end if
1131            end do
1132          end if
1133        else
1134          write(message,'(a,i0,5a)')&
1135 &         'The eigenvector with band ',iband,' has zero norm.',ch10,&
1136 &         'This usually happens when the number of bands (nband) is comparable to the number of planewaves (mpw)',ch10,&
1137 &         'Action: Check the parameters of the calculation. If nband ~ mpw, then decrease nband or, alternatively, increase ecut'
1138          MSG_ERROR(message)
1139        end if
1140 
1141        xx=cos(theta)
1142        yy=sin(theta)
1143 
1144 !      Here, set the first non-zero element to be positive
1145 !      Comment the next nine lines to recover the behaviour of pre v3.1.3g
1146 !      Hypothesis : there is at least one non-zero element on the master node ...
1147        do ii=1+indx,npw_k+indx
1148          cre=cg(1,ii)
1149          cim=cg(2,ii)
1150          cre=xx*cre-yy*cim
1151          if(abs(cre)>tol8)exit
1152        end do
1153        if(cre<zero)then
1154          xx=-xx ; yy=-yy
1155        end if
1156 
1157        creb(iband)=xx
1158        cimb(iband)=yy
1159 
1160      end do
1161    end if
1162 
1163 !  XG030513 : MPIWF : should transmit creb(:),cimb(:) of the master
1164 !  processor of the WF group to the others procs of the WF group
1165    if (mpi_enreg%paral_kgb == 1) then
1166      call timab(48,1,tsec)
1167      buffer2(:,1)=creb(:)
1168      buffer2(:,2)=cimb(:)
1169      call xmpi_sum(buffer2,mpi_enreg%comm_fft,ierr)
1170      if (mpi_enreg%paral_spinor==1) then
1171        call xmpi_sum(buffer2,mpi_enreg%comm_spinor,ierr)
1172      end if
1173      call timab(48,2,tsec)
1174      creb(:)=buffer2(:,1)
1175      cimb(:)=buffer2(:,2)
1176    end if
1177 
1178 !  MG TODO: Scaling can be done with zscal
1179 !$OMP PARALLEL DO PRIVATE(indx,xx,yy,cre,cim,gscre,gscim)
1180    do iband=1,nband_k
1181      indx=icg+(iband-1)*npw_k
1182 
1183      xx=creb(iband)
1184      yy=cimb(iband)
1185 !    Alter phase of array |cg>
1186      do ii=1+indx,npw_k+indx
1187        cre=cg(1,ii)
1188        cim=cg(2,ii)
1189        cg(1,ii)=xx*cre-yy*cim
1190        cg(2,ii)=xx*cim+yy*cre
1191      end do
1192 
1193 !    Alter phase of array S|cg>
1194      if (useoverlap==1) then
1195        indx=igsc+(iband-1)*npw_k
1196        do ii=1+indx,npw_k+indx
1197          gscre=gsc(1,ii)
1198          gscim=gsc(2,ii)
1199          gsc(1,ii)=xx*gscre-yy*gscim
1200          gsc(2,ii)=xx*gscim+yy*gscre
1201        end do
1202      end if
1203    end do ! iband
1204 
1205    ABI_DEALLOCATE(cimb)
1206    ABI_DEALLOCATE(creb)
1207    ABI_DEALLOCATE(saab)
1208    ABI_DEALLOCATE(sabb)
1209    ABI_DEALLOCATE(sbbb)
1210 
1211  else  ! if istwfk/=1.  Storages that take into account the time-reversal symmetry : the freedom is only a sign freedom
1212 
1213    ABI_ALLOCATE(creb,(nband_k))
1214    creb(:)=zero
1215 !  XG030513 : MPIWF : this loop should be done only by the master processor of the WF group
1216 
1217    if (mpi_enreg%paral_kgb==0.or.mpi_enreg%me_fft==0) then
1218 
1219 !    Loop over bands
1220      do iband=1,nband_k
1221 
1222        indx=icg+(iband-1)*npw_k
1223 
1224 !      Here, set the first non-zero real element to be positive
1225        do ii=1+indx,npw_k+indx
1226          cre=cg(1,ii)
1227          if(abs(cre)>tol8)exit
1228        end do
1229        creb(iband)=cre
1230 
1231      end do ! iband
1232 
1233    end if
1234 !  XG030513 : MPIWF : should transmit cre(:) of the master processor of the WF group to the others
1235    if (mpi_enreg%paral_kgb == 1) then
1236      call timab(48,1,tsec)
1237      call xmpi_sum(creb,mpi_enreg%comm_fft,ierr)
1238      if (mpi_enreg%paral_spinor==1) then
1239        call xmpi_sum(creb,mpi_enreg%comm_spinor,ierr)
1240      end if
1241      call timab(48,2,tsec)
1242    end if
1243 
1244    do iband=1,nband_k
1245      cre=creb(iband)
1246      if(cre<zero)then
1247        indx=icg+(iband-1)*npw_k
1248        do ii=1+indx,npw_k+indx
1249          cg(1,ii)=-cg(1,ii)
1250          cg(2,ii)=-cg(2,ii)
1251        end do
1252        if(useoverlap==1)then
1253          indx=igsc+(iband-1)*npw_k
1254          do ii=1+indx,npw_k+indx
1255            gsc(1,ii)=-gsc(1,ii)
1256            gsc(2,ii)=-gsc(2,ii)
1257          end do
1258        end if
1259      end if
1260    end do ! iband
1261 
1262    ABI_DEALLOCATE(creb)
1263  end if ! istwfk
1264 
1265 end subroutine fxphas

ABINIT/m_vtowfk [ Modules ]

[ Top ] [ Modules ]

NAME

  m_vtowfk

FUNCTION

COPYRIGHT

  Copyright (C) 1998-2018 ABINIT group (DCA, XG, GMR, MT)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

21 #if defined HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include "abi_common.h"
26 
27 module m_vtowfk
28 
29  use defs_basis
30  use defs_abitypes
31  use m_abicore
32  use m_errors
33  use m_xmpi
34  use m_efield
35  use m_linalg_interfaces
36  use m_cgtools
37 
38  use m_time,        only : timab
39  use m_hamiltonian, only : gs_hamiltonian_type
40  use m_paw_dmft,    only : paw_dmft_type
41  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_put,pawcprj_copy
42  use m_paw_dmft,    only : paw_dmft_type
43  use m_gwls_hamiltonian, only : build_H
44  use m_cgwf,        only : cgwf
45  use m_lobpcgwf_old,only : lobpcgwf
46  use m_lobpcgwf,    only : lobpcgwf2
47  use m_spacepar,    only : meanvalue_g
48  use m_chebfi,      only : chebfi
49  use m_nonlop,      only : nonlop
50  use m_prep_kgb,    only : prep_nonlop, prep_fourwf
51  use m_fft,         only : fourwf
52 
53  implicit none
54 
55  private

ABINIT/vtowfk [ Functions ]

[ Top ] [ Functions ]

NAME

 vtowfk

FUNCTION

 This routine compute the partial density at a given k-point,
 for a given spin-polarization, from a fixed Hamiltonian
 but might also simply compute eigenvectors and eigenvalues at this k point

INPUTS

  cgq = array that holds the WF of the nearest neighbours of
        the current k-point (electric field, MPI //)
  cpus= cpu time limit in seconds
  dtefield <type(efield_type)> = variables related to Berry phase
      calculations (see initberry.f)
  dtfil <type(datafiles_type)>=variables related to files
  dtset <type(dataset_type)>=all input variables for this dataset
  fixed_occ=true if electronic occupations are fixed (occopt<3)
  gs_hamk <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k
  icg=shift to be applied on the location of data in the array cprj
  icg=shift to be applied on the location of data in the array cg
  ikpt=number of the k-point
  iscf=(<= 0  =>non-SCF), >0 => SCF
  isppol isppol=1 for unpolarized, 2 for spin-polarized
  kg_k(3,npw_k)=reduced planewave coordinates.
  kinpw(npw_k)=(modified) kinetic energy for each plane wave (Hartree)
  mcg=second dimension of the cg array
  mcgq=second dimension of the cgq array (electric field, MPI //)
  mcprj=size of projected wave-functions array (cprj) =nspinor*mband*mkmem*nsppol
  mkgq = second dimension of pwnsfacq
  mpi_enreg=informations about MPI parallelization
  mpw=maximum dimensioned size of npw
  natom=number of atoms in cell.
  nband_k=number of bands at this k point for that spin polarization
  nkpt=number of k points.
  nnsclo_now=number of non-self-consistent loops for the current vtrial
             (often 1 for SCF calculation, =nstep for non-SCF calculations)
  npw_k=number of plane waves at this k point
  npwarr(nkpt)=number of planewaves in basis at this k point
  occ_k(nband_k)=occupation number for each band (usually 2) for each k.
  optforces=option for the computation of forces
  prtvol=control print volume and debugging output
  pwind(pwind_alloc,2,3)= array used to compute
           the overlap matrix smat between k-points (see initberry.f)
  pwind_alloc= first dimension of pwind
  pwnsfac(2,pwind_alloc)= phase factors for non-symmorphic translations
                          (see initberry.f)
  pwnsfacq(2,mkgq)= phase factors for the nearest neighbours of the
                    current k-point (electric field, MPI //)
  usebanfft=flag for band-fft parallelism
  paw_dmft  <type(paw_dmft_type)>= paw+dmft related data
  wtk=weight assigned to the k point.
  zshift(nband_k)=energy shifts for the squared shifted hamiltonian algorithm

OUTPUT

  dphase_k(3)=change in Zak phase for the current k-point
  eig_k(nband_k)=array for holding eigenvalues (hartree)
  ek_k(nband_k)=contribution from each band to kinetic energy, at this k-point
  ek_k_nd(2,nband_k,nband_k*use_dmft)=contribution to kinetic energy, including non-diagonal terms, at this k-point (usefull if use_dmft)
  resid_k(nband_k)=residuals for each band over all k points,
                   BEFORE the band rotation.
  ==== if optforces>0 ====
    grnl_k(3*natom,nband_k)=nonlocal gradients, at this k-point
  ==== if (gs_hamk%usepaw==0) ====
    enl_k(nband_k)=contribution from each band to nonlocal pseudopotential part of total energy, at this k-point
  ==== if (gs_hamk%usepaw==1) ====
    cprj(natom,mcprj*usecprj)= wave functions projected with non-local projectors:
                               cprj(n,k,i)=<p_i|Cnk> where p_i is a non-local projector.

SIDE EFFECTS

  cg(2,mcg)=updated wavefunctions
  rhoaug(n4,n5,n6,nvloc)= density in electrons/bohr**3, on the augmented fft grid.
                    (cumulative, so input as well as output). Update only
                    for occopt<3 (fixed occupation numbers)

PARENTS

      vtorho

CHILDREN

      build_h,cgwf,chebfi,dsymm,fourwf,fxphas,lobpcgwf,lobpcgwf2,meanvalue_g
      nonlop,pawcprj_alloc,pawcprj_copy,pawcprj_free,pawcprj_put,prep_fourwf
      prep_nonlop,pw_orthon,subdiago,timab,wrtout,xmpi_sum,zhemm

NOTES

  The cprj are distributed over band and spinors processors.
  One processor doesn't know all the cprj.
  Only the mod((iband-1)/mpi_enreg%bandpp,mpi_enreg%nproc_band) projectors
  are stored on each proc.

SOURCE

155 subroutine vtowfk(cg,cgq,cprj,cpus,dphase_k,dtefield,dtfil,dtset,&
156 & eig_k,ek_k,ek_k_nd,enl_k,fixed_occ,grnl_k,gs_hamk,&
157 & ibg,icg,ikpt,iscf,isppol,kg_k,kinpw,mband_cprj,mcg,mcgq,mcprj,mkgq,mpi_enreg,&
158 & mpw,natom,nband_k,nkpt,nnsclo_now,npw_k,npwarr,occ_k,optforces,prtvol,&
159 & pwind,pwind_alloc,pwnsfac,pwnsfacq,resid_k,rhoaug,paw_dmft,wtk,zshift)
160 
161 
162 !This section has been created automatically by the script Abilint (TD).
163 !Do not modify the following lines by hand.
164 #undef ABI_FUNC
165 #define ABI_FUNC 'vtowfk'
166 !End of the abilint section
167 
168  implicit none
169 
170 !Arguments ------------------------------------
171  integer, intent(in) :: ibg,icg,ikpt,iscf,isppol,mband_cprj,mcg,mcgq,mcprj,mkgq,mpw
172  integer, intent(in) :: natom,nband_k,nkpt,nnsclo_now,npw_k,optforces
173  integer, intent(in) :: prtvol,pwind_alloc
174  logical,intent(in) :: fixed_occ
175  real(dp), intent(in) :: cpus,wtk
176  type(datafiles_type), intent(in) :: dtfil
177  type(efield_type), intent(inout) :: dtefield
178  type(dataset_type), intent(in) :: dtset
179  type(gs_hamiltonian_type), intent(inout) :: gs_hamk
180  type(MPI_type), intent(inout) :: mpi_enreg
181  type(paw_dmft_type), intent(in)  :: paw_dmft
182  integer, intent(in) :: kg_k(3,npw_k)
183  integer, intent(in) :: npwarr(nkpt),pwind(pwind_alloc,2,3)
184  real(dp), intent(in) :: cgq(2,mcgq),kinpw(npw_k),occ_k(nband_k)
185  real(dp), intent(in) :: pwnsfac(2,pwind_alloc),pwnsfacq(2,mkgq)
186  real(dp), intent(in) :: zshift(nband_k)
187  real(dp), intent(out) :: eig_k(nband_k),ek_k(nband_k),dphase_k(3),ek_k_nd(2,nband_k,nband_k*paw_dmft%use_dmft)
188  real(dp), intent(out) :: enl_k(nband_k*(1-gs_hamk%usepaw))
189  real(dp), intent(out) :: grnl_k(3*natom,nband_k*optforces)
190  real(dp), intent(out) :: resid_k(nband_k)
191  real(dp), intent(inout) :: cg(2,mcg),rhoaug(gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,gs_hamk%nvloc)
192  type(pawcprj_type),intent(inout) :: cprj(natom,mcprj*gs_hamk%usecprj)
193 
194 !Local variables-------------------------------
195  logical :: newlobpcg
196  integer,parameter :: level=112,tim_fourwf=2,tim_nonlop_prep=11
197  integer,save :: nskip=0
198 !     Flag use_subovl: 1 if "subovl" array is computed (see below)
199 !     subovl should be Identity (in that case we should use use_subovl=0)
200 !     But this is true only if conjugate gradient algo. converges
201  integer :: use_subovl=0
202  integer :: bandpp_cprj,blocksize,choice,cpopt,iband,iband1
203  integer :: iblock,iblocksize,ibs,idir,ierr,igs,igsc,ii,pidx,inonsc
204  integer :: iorder_cprj,ipw,ispinor,ispinor_index,istwf_k,iwavef,jj,mgsc,my_nspinor,n1,n2,n3 !kk
205  integer :: nband_k_cprj,nblockbd,ncpgr,ndat,nkpt_max,nnlout,ortalgo
206  integer :: paw_opt,quit,signs,spaceComm,tim_nonlop,wfoptalg,wfopta10
207  logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
208  real(dp) :: ar,ar_im,eshift,occblock
209  real(dp) :: res,residk,weight
210  character(len=500) :: message
211  real(dp) :: dummy(2,1),nonlop_dum(1,1),tsec(2)
212  real(dp),allocatable :: cwavef(:,:),cwavef1(:,:),cwavef_x(:,:),cwavef_y(:,:),cwavefb(:,:,:)
213  real(dp),allocatable :: eig_save(:),enlout(:),evec(:,:),evec_loc(:,:),gsc(:,:)
214  real(dp),allocatable :: mat_loc(:,:),mat1(:,:,:),matvnl(:,:,:)
215  real(dp),allocatable :: subham(:),subovl(:),subvnl(:),totvnl(:,:),wfraug(:,:,:,:)
216  type(pawcprj_type),allocatable :: cwaveprj(:,:)
217 
218 ! **********************************************************************
219 
220  DBG_ENTER("COLL")
221 
222  call timab(28,1,tsec) ! Keep track of total time spent in "vtowfk"
223 
224 !Structured debugging if prtvol==-level
225  if(prtvol==-level)then
226    write(message,'(80a,a,a)') ('=',ii=1,80),ch10,'vtowfk : enter'
227    call wrtout(std_out,message,'PERS')
228  end if
229 
230 
231 !=========================================================================
232 !============= INITIALIZATIONS AND ALLOCATIONS ===========================
233 !=========================================================================
234 
235  nkpt_max=50; if(xmpi_paral==1)nkpt_max=-1
236 
237  wfoptalg=mod(dtset%wfoptalg,100); wfopta10=mod(wfoptalg,10)
238  newlobpcg = (dtset%wfoptalg == 114 .and. dtset%use_gpu_cuda == 0)
239  istwf_k=gs_hamk%istwf_k
240  quit=0
241 
242 !Parallelization over spinors management
243  my_nspinor=max(1,dtset%nspinor/mpi_enreg%nproc_spinor)
244  if (mpi_enreg%paral_spinor==0) then
245    ispinor_index=1
246    nspinor1TreatedByThisProc=.true.
247    nspinor2TreatedByThisProc=(dtset%nspinor==2)
248  else
249    ispinor_index=mpi_enreg%me_spinor+1
250    nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0)
251    nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1)
252  end if
253 
254 !Parallelism over FFT and/or bands: define sizes and tabs
255  !if (mpi_enreg%paral_kgb==1) then
256  nblockbd=nband_k/(mpi_enreg%nproc_band*mpi_enreg%bandpp)
257  !else
258  !  nblockbd=nband_k/mpi_enreg%nproc_fft
259  !  if (nband_k/=nblockbd*mpi_enreg%nproc_fft) nblockbd=nblockbd+1
260  !end if
261  blocksize=nband_k/nblockbd
262 
263 !Save eshift
264  if(wfoptalg==3)then
265    eshift=zshift(1)
266    ABI_ALLOCATE(eig_save,(nband_k))
267    eig_save(:)=eshift
268  end if
269 
270  n1=gs_hamk%ngfft(1); n2=gs_hamk%ngfft(2); n3=gs_hamk%ngfft(3)
271 
272  if ( .not. newlobpcg ) then
273    igsc=0
274    mgsc=nband_k*npw_k*my_nspinor*gs_hamk%usepaw
275 
276    ABI_STAT_ALLOCATE(gsc,(2,mgsc), ierr)
277    ABI_CHECK(ierr==0, "out of memory in gsc")
278    gsc=zero
279  end if
280 
281  if(wfopta10 /= 1 .and. .not. newlobpcg ) then !chebfi already does this stuff inside
282    ABI_ALLOCATE(evec,(2*nband_k,nband_k))
283    ABI_ALLOCATE(subham,(nband_k*(nband_k+1)))
284 
285    ABI_ALLOCATE(subvnl,(0))
286    ABI_ALLOCATE(totvnl,(0,0))
287    if (gs_hamk%usepaw==0) then
288      if (wfopta10==4) then
289        ABI_DEALLOCATE(totvnl)
290        if (istwf_k==1) then
291          ABI_ALLOCATE(totvnl,(2*nband_k,nband_k))
292        else if (istwf_k==2) then
293          ABI_ALLOCATE(totvnl,(nband_k,nband_k))
294        end if
295      else
296        ABI_DEALLOCATE(subvnl)
297        ABI_ALLOCATE(subvnl,(nband_k*(nband_k+1)))
298      end if
299    end if
300 
301    if (use_subovl==1) then
302      ABI_ALLOCATE(subovl,(nband_k*(nband_k+1)))
303    else
304      ABI_ALLOCATE(subovl,(0))
305    end if
306  end if
307 
308 !Carry out UP TO dtset%nline steps, or until resid for every band is < dtset%tolwfr
309 
310  if(prtvol>2 .or. ikpt<=nkpt_max)then
311    write(message,'(a,i5,2x,a,3f9.5,2x,a)')' non-scf iterations; kpt # ',ikpt,', k= (',gs_hamk%kpt_k,'), band residuals:'
312    call wrtout(std_out,message,'PERS')
313  end if
314 
315 !Electric field: initialize dphase_k
316  dphase_k(:) = zero
317 
318 !=========================================================================
319 !==================== NON-SELF-CONSISTENT LOOP ===========================
320 !=========================================================================
321 
322 !nnsclo_now=number of non-self-consistent loops for the current vtrial
323 !(often 1 for SCF calculation, =nstep for non-SCF calculations)
324  call timab(39,1,tsec) ! "vtowfk (loop)"
325 
326  do inonsc=1,nnsclo_now
327 
328 !  This initialisation is needed for the MPI-parallelisation (gathering using sum)
329    if(wfopta10 /= 1 .and. .not. newlobpcg) then
330      subham(:)=zero
331      if (gs_hamk%usepaw==0) then
332        if (wfopta10==4) then
333          totvnl(:,:)=zero
334        else
335          subvnl(:)=zero
336        end if
337      end if
338      if (use_subovl==1)subovl(:)=zero
339    end if
340    resid_k(:)=zero
341 
342 !  Filter the WFs when modified kinetic energy is too large (see routine mkkin.f)
343 !  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(igs,iwavef)
344    do ispinor=1,my_nspinor
345      do iband=1,nband_k
346        igs=(ispinor-1)*npw_k
347        iwavef=(iband-1)*npw_k*my_nspinor+icg
348        do ipw=1+igs,npw_k+igs
349          if(kinpw(ipw-igs)>huge(zero)*1.d-11)then
350            cg(1,ipw+iwavef)=zero
351            cg(2,ipw+iwavef)=zero
352          end if
353        end do
354      end do
355    end do
356 
357    ! JLJ 17/10/2014 : If it is a GWLS calculation, construct the hamiltonian
358    ! as in a usual GS calc., but skip any minimisation procedure.
359    ! This would be equivalent to nstep=0, if the latter did work.
360    if(dtset%optdriver/=RUNL_GWLS) then
361 
362      if(wfopta10==4.or.wfopta10==1) then
363        if (istwf_k/=1.and.istwf_k/=2) then !no way to use lobpcg
364          write(message,'(3a)')&
365 &         'Only istwfk=1 or 2 are allowed with wfoptalg=4/14 !',ch10,&
366 &         'Action: put istwfk to 1 or remove k points with half integer coordinates.'
367          MSG_ERROR(message)
368        end if
369 
370 !    =========================================================================
371 !    ============ MINIMIZATION OF BANDS: LOBPCG ==============================
372 !    =========================================================================
373        if (wfopta10==4) then
374          if ( .not. newlobpcg ) then
375            call lobpcgwf(cg,dtset,gs_hamk,gsc,icg,igsc,kinpw,mcg,mgsc,mpi_enreg,&
376 &           nband_k,nblockbd,npw_k,prtvol,resid_k,subham,totvnl)
377 !          In case of FFT parallelism, exchange subspace arrays
378            spaceComm=mpi_enreg%comm_bandspinorfft
379            call xmpi_sum(subham,spaceComm,ierr)
380            if (gs_hamk%usepaw==0) then
381              if (wfopta10==4) then
382                call xmpi_sum(totvnl,spaceComm,ierr)
383              else
384                call xmpi_sum(subvnl,spaceComm,ierr)
385              end if
386            end if
387            if (use_subovl==1) call xmpi_sum(subovl,spaceComm,ierr)
388          else
389            call lobpcgwf2(cg(:,icg+1:),dtset,eig_k,enl_k,gs_hamk,kinpw,mpi_enreg,&
390 &           nband_k,npw_k,my_nspinor,prtvol,resid_k)
391          end if
392 !        In case of FFT parallelism, exchange subspace arrays
393 
394 !    =========================================================================
395 !    ============ MINIMIZATION OF BANDS: CHEBYSHEV FILTERING =================
396 !    =========================================================================
397        else if (wfopta10 == 1) then
398          call chebfi(cg(:, icg+1:),dtset,eig_k,enl_k,gs_hamk,gsc,kinpw,&
399 &         mpi_enreg,nband_k,npw_k,my_nspinor,prtvol,resid_k)
400        end if
401 
402 !      =========================================================================
403 !      ======== MINIMIZATION OF BANDS: CONJUGATE GRADIENT (Teter et al.) =======
404 !      =========================================================================
405      else
406        call cgwf(dtset%berryopt,cg,cgq,dtset%chkexit,cpus,dphase_k,dtefield,dtfil%filnam_ds(1),&
407 &       gsc,gs_hamk,icg,igsc,ikpt,inonsc,isppol,dtset%mband,mcg,mcgq,mgsc,mkgq,&
408 &       mpi_enreg,mpw,nband_k,dtset%nbdblock,nkpt,dtset%nline,npw_k,npwarr,my_nspinor,&
409 &       dtset%nsppol,dtset%ortalg,prtvol,pwind,pwind_alloc,pwnsfac,pwnsfacq,quit,resid_k,&
410 &       subham,subovl,subvnl,dtset%tolrde,dtset%tolwfr,use_subovl,wfoptalg,zshift)
411      end if
412    end if
413 
414 !  =========================================================================
415 !  ===================== FIND LARGEST RESIDUAL =============================
416 !  =========================================================================
417 
418 !  Find largest resid over bands at this k point
419 !  Note that this operation is done BEFORE rotation of bands :
420 !  it would be time-consuming to recompute the residuals after.
421    residk=maxval(resid_k(1:max(1,nband_k-dtset%nbdbuf)))
422 
423 !  Print residuals
424    if(prtvol>2 .or. ikpt<=nkpt_max)then
425      do ii=0,(nband_k-1)/8
426        write(message,'(a,8es10.2)')' res:',(resid_k(iband),iband=1+ii*8,min(nband_k,8+ii*8))
427        call wrtout(std_out,message,'PERS')
428      end do
429    end if
430 
431 !  =========================================================================
432 !  ========== DIAGONALIZATION OF HAMILTONIAN IN WFs SUBSPACE ===============
433 !  =========================================================================
434 
435    if( .not. wfopta10 == 1 .and. .not. newlobpcg ) then
436      call timab(585,1,tsec) !"vtowfk(subdiago)"
437      call subdiago(cg,eig_k,evec,gsc,icg,igsc,istwf_k,&
438 &     mcg,mgsc,nband_k,npw_k,my_nspinor,dtset%paral_kgb,&
439 &     subham,subovl,use_subovl,gs_hamk%usepaw,mpi_enreg%me_g0)
440      call timab(585,2,tsec)
441    end if
442 
443 !  Print energies
444    if(prtvol>2 .or. ikpt<=nkpt_max)then
445      do ii=0,(nband_k-1)/8
446        write(message, '(a,8es10.2)' )' ene:',(eig_k(iband),iband=1+ii*8,min(nband_k,8+ii*8))
447        call wrtout(std_out,message,'PERS')
448      end do
449    end if
450 
451 !  THIS CHANGE OF SHIFT DOES NOT WORK WELL
452 !  Update zshift in the case of wfoptalg==3
453 !  if(wfoptalg==3 .and. inonsc/=1)then
454 !  do iband=1,nband_k
455 !  if(eig_k(iband)<eshift .and. eig_save(iband)<eshift)then
456 !  zshift(iband)=max(eig_k(iband),eig_save(iband))
457 !  end if
458 !  if(eig_k(iband)>eshift .and. eig_save(iband)>eshift)then
459 !  zshift(iband)=min(eig_k(iband),eig_save(iband))
460 !  end if
461 !  end do
462 !  eig_save(:)=eig_k(:)
463 !  end if
464 
465 !  =========================================================================
466 !  =============== ORTHOGONALIZATION OF WFs (if needed) ====================
467 !  =========================================================================
468 
469 !  Re-orthonormalize the wavefunctions at this k point--
470 !  this is redundant but is performed to combat rounding error in wavefunction orthogonality
471 
472    call timab(583,1,tsec) ! "vtowfk(pw_orthon)"
473    ortalgo=mpi_enreg%paral_kgb
474    if ((wfoptalg/=14 .and. wfoptalg /= 1).or.dtset%ortalg>0) then
475      call pw_orthon(icg,igsc,istwf_k,mcg,mgsc,npw_k*my_nspinor,nband_k,ortalgo,gsc,gs_hamk%usepaw,cg,&
476 &     mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft)
477    end if
478    call timab(583,2,tsec)
479 
480 !  DEBUG seq==par comment next block
481 !  Fix phases of all bands
482    if ((xmpi_paral/=1).or.(mpi_enreg%paral_kgb/=1)) then
483      if ( .not. newlobpcg ) then
484        call fxphas(cg,gsc,icg,igsc,istwf_k,mcg,mgsc,mpi_enreg,nband_k,npw_k*my_nspinor,gs_hamk%usepaw)
485      else
486        ! GSC is local to vtowfk and is completely useless since everything
487        ! is calcultated in my lobpcg, we don't care about the phase of gsc !
488        call fxphas(cg,gsc,icg,igsc,istwf_k,mcg,mgsc,mpi_enreg,nband_k,npw_k*my_nspinor,0)
489      end if
490    end if
491 
492    if (residk<dtset%tolwfr) exit  !  Exit loop over inonsc if converged
493  end do !  End loop over inonsc (NON SELF-CONSISTENT LOOP)
494 
495  call timab(39,2,tsec)
496  call timab(30,1,tsec) ! "vtowfk  (afterloop)"
497 
498 !###################################################################
499 
500 !Compute kinetic energy and non-local energy for each band, and in the SCF
501 !case, contribution to forces, and eventually accumulate rhoaug
502 
503  ndat=1;if (mpi_enreg%paral_kgb==1) ndat=mpi_enreg%bandpp
504  if(iscf>0 .and. fixed_occ)  then
505    ABI_ALLOCATE(wfraug,(2,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6*ndat))
506  end if
507 
508 !"nonlop" routine input parameters
509  nnlout=3*natom*optforces
510  signs=1;idir=0
511  if (gs_hamk%usepaw==0) then
512    choice=1+optforces
513    paw_opt=0;cpopt=-1;tim_nonlop=2
514  else
515    choice=2*optforces
516    paw_opt=2;cpopt=0;tim_nonlop=10-8*optforces
517    if (dtset%usefock==1) then
518 !     if (dtset%optforces/= 0) then
519      if (optforces/= 0) then
520        choice=2;cpopt=1; nnlout=3*natom
521      end if
522    end if
523  end if
524 
525  ABI_ALLOCATE(enlout,(nnlout*blocksize))
526 
527 !Allocation of memory space for one WF
528  ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize))
529  if (gs_hamk%usepaw==1.and.(iscf>0.or.gs_hamk%usecprj==1)) then
530    iorder_cprj=0
531    nband_k_cprj=nband_k*(mband_cprj/dtset%mband)
532    bandpp_cprj=mpi_enreg%bandpp
533    ABI_DATATYPE_ALLOCATE(cwaveprj,(natom,my_nspinor*bandpp_cprj))
534    ncpgr=0;if (cpopt==1) ncpgr=cprj(1,1)%ncpgr
535    call pawcprj_alloc(cwaveprj,ncpgr,gs_hamk%dimcprj)
536  else
537    ABI_DATATYPE_ALLOCATE(cwaveprj,(0,0))
538  end if
539 
540 #undef DEV_NEW_CODE
541 !#define DEV_NEW_CODE
542 
543 !The code below is more efficient if paral_kgb==1 (less MPI communications)
544 !however OMP is not compatible with paral_kgb since we should define
545 !which threads performs the call to MPI_ALL_REDUCE.
546 !This problem can be easily solved by removing MPI_enreg from meanvalue_g so that
547 !the MPI call is done only once outside the OMP parallel region.
548 
549 #ifdef DEV_NEW_CODE
550 !Loop over bands or blocks of bands. Note that in sequential mode iblock=iband, nblockbd=nband_k and blocksize=1
551 !!$OMP PARALLEL DO PRIVATE(iband,ar)
552  do iblock=1,nblockbd
553 
554 !  Compute kinetic energy of each band
555    do iblocksize=1,blocksize
556      iband=(iblock-1)*blocksize+iblocksize
557 
558      call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
559 &     cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),&
560 &     cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),0)
561 
562      ek_k(iband)=ar
563 
564      if(paw_dmft%use_dmft==1) then
565        do iband1=1,nband_k
566          call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
567 &         cg(:,1+(iband -1)*npw_k*my_nspinor+icg:iband *npw_k*my_nspinor+icg),&
568 &         cg(:,1+(iband1-1)*npw_k*my_nspinor+icg:iband1*npw_k*my_nspinor+icg),paw_dmft%use_dmft,ar_im=ar_im)
569          ek_k_nd(1,iband,iband1)=ar
570          ek_k_nd(2,iband,iband1)=ar_im
571        end do
572      end if
573 !    if(use_dmft==1) then
574 !    do iband1=1,nband_k
575 !    call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
576 !    &         cg(:,1+(iband -1)*npw_k*my_nspinor+icg:iband *npw_k*my_nspinor+icg),&
577 !    &         cg(:,1+(iband1-1)*npw_k*my_nspinor+icg:iband1*npw_k*my_nspinor+icg),use_dmft)
578 !    ek_k_nd(iband,iband1)=ar
579 !    end do
580 !    end if
581 
582    end do
583  end do
584 !TODO: xmpi_sum is missing but I have to understand the logic used to deal with the different
585 !MPI options and communicators.
586 #endif
587 
588 
589 !Loop over bands or blocks of bands. Note that in sequential mode iblock=iband, nblockbd=nband_k and blocksize=1
590  do iblock=1,nblockbd
591    occblock=maxval(occ_k(1+(iblock-1)*blocksize:iblock*blocksize))
592    cwavef(:,:)=cg(:,1+(iblock-1)*npw_k*my_nspinor*blocksize+icg:iblock*npw_k*my_nspinor*blocksize+icg)
593 
594 #ifndef DEV_NEW_CODE
595 !  Compute kinetic energy of each band
596    do iblocksize=1,blocksize
597      iband=(iblock-1)*blocksize+iblocksize
598 
599      call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
600 &     cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),&
601 &     cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),0)
602 
603      ek_k(iband)=ar
604 
605      if(paw_dmft%use_dmft==1) then
606        do iband1=1,nband_k
607          call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
608 &         cg(:,1+(iband -1)*npw_k*my_nspinor+icg:iband *npw_k*my_nspinor+icg),&
609 &         cg(:,1+(iband1-1)*npw_k*my_nspinor+icg:iband1*npw_k*my_nspinor+icg),paw_dmft%use_dmft,ar_im=ar_im)
610          ek_k_nd(1,iband,iband1)=ar
611          ek_k_nd(2,iband,iband1)=ar_im
612        end do
613      end if
614    end do
615 #endif
616 
617    if(iscf>0)then ! In case of fixed occupation numbers, accumulates the partial density
618      if (fixed_occ .and. mpi_enreg%paral_kgb/=1) then
619        if (abs(occ_k(iblock))>=tol8) then
620          weight=occ_k(iblock)*wtk/gs_hamk%ucvol
621 !        Accumulate charge density in real space in array rhoaug
622 
623 !        The same section of code is also found in mkrho.F90 : should be rationalized !
624          call fourwf(1,rhoaug(:,:,:,1),cwavef,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
625 &         istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
626 &         gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
627 &         dtset%paral_kgb,tim_fourwf,weight,weight,use_gpu_cuda=dtset%use_gpu_cuda)
628 
629          if(dtset%nspinor==2)then
630            ABI_ALLOCATE(cwavef1,(2,npw_k))
631            cwavef1(:,:)=cwavef(:,1+npw_k:2*npw_k) ! EB FR spin dn part and used for m_z component (cwavef_z)
632 
633            if(dtset%nspden==1) then
634 
635              call fourwf(1,rhoaug(:,:,:,1),cwavef1,dummy,wfraug,&
636 &             gs_hamk%gbound_k,gs_hamk%gbound_k,&
637 &             istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
638 &             gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
639 &             dtset%paral_kgb,tim_fourwf,weight,weight,use_gpu_cuda=dtset%use_gpu_cuda)
640 
641            else if(dtset%nspden==4) then
642 !            Build the four components of rho. We use only norm quantities and, so fourwf.
643 !$\sum_{n} f_n \Psi^{* \alpha}_n \Psi^{\alpha}_n =\rho^{\alpha \alpha}$
644 !$\sum_{n} f_n (\Psi^{1}+\Psi^{2})^*_n (\Psi^{1}+\Psi^{2})_n=rho+m_x$
645 !$\sum_{n} f_n (\Psi^{1}-i \Psi^{2})^*_n (\Psi^{1}-i \Psi^{2})_n=rho+m_y$
646              ABI_ALLOCATE(cwavef_x,(2,npw_k))
647              ABI_ALLOCATE(cwavef_y,(2,npw_k))
648 !$(\Psi^{1}+\Psi^{2})$
649              cwavef_x(:,:)=cwavef(:,1:npw_k)+cwavef1(:,1:npw_k)
650 !$(\Psi^{1}-i \Psi^{2})$
651              cwavef_y(1,:)=cwavef(1,1:npw_k)+cwavef1(2,1:npw_k)
652              cwavef_y(2,:)=cwavef(2,1:npw_k)-cwavef1(1,1:npw_k)
653 ! z component
654              call fourwf(1,rhoaug(:,:,:,4),cwavef1,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
655 &             istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
656 &             gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,dtset%paral_kgb,&
657 &             tim_fourwf,weight,weight,use_gpu_cuda=dtset%use_gpu_cuda)
658 ! x component
659              call fourwf(1,rhoaug(:,:,:,2),cwavef_x,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
660 &             istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
661 &             gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,dtset%paral_kgb,&
662 &             tim_fourwf,weight,weight,use_gpu_cuda=dtset%use_gpu_cuda)
663 ! y component
664              call fourwf(1,rhoaug(:,:,:,3),cwavef_y,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
665 &             istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
666 &             gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,dtset%paral_kgb,&
667 &             tim_fourwf,weight,weight,use_gpu_cuda=dtset%use_gpu_cuda)
668 
669              ABI_DEALLOCATE(cwavef_x)
670              ABI_DEALLOCATE(cwavef_y)
671 
672            end if ! dtset%nspden/=4
673            ABI_DEALLOCATE(cwavef1)
674          end if
675        else
676          nskip=nskip+1
677        end if
678 
679 !      In case of fixed occupation numbers,in bandFFT mode accumulates the partial density
680      else if (fixed_occ .and. mpi_enreg%paral_kgb==1) then
681 
682        if (dtset%nspinor==1) then
683          call timab(537,1,tsec) ! "prep_fourwf%vtow"
684          call prep_fourwf(rhoaug(:,:,:,1),blocksize,cwavef,wfraug,iblock,istwf_k,&
685 &         gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,npw_k,&
686 &         gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,&
687 &         1,gs_hamk%ucvol,wtk,use_gpu_cuda=dtset%use_gpu_cuda)
688          call timab(537,2,tsec)
689        else if (dtset%nspinor==2) then
690          ABI_ALLOCATE(cwavefb,(2,npw_k*blocksize,2))
691          ibs=(iblock-1)*npw_k*my_nspinor*blocksize+icg
692 !        --- No parallelization over spinors ---
693          if (mpi_enreg%paral_spinor==0) then
694            do iband=1,blocksize
695              cwavefb(:,(iband-1)*npw_k+1:iband*npw_k,1)=cg(:,1+(2*iband-2)*npw_k+ibs:(iband*2-1)*npw_k+ibs)
696              cwavefb(:,(iband-1)*npw_k+1:iband*npw_k,2)=cg(:,1+(2*iband-1)*npw_k+ibs:iband*2*npw_k+ibs)
697            end do
698          else
699 !          --- Parallelization over spinors ---
700 !          (split the work between 2 procs)
701            cwavefb(:,:,3-ispinor_index)=zero
702            do iband=1,blocksize
703              cwavefb(:,(iband-1)*npw_k+1:iband*npw_k,ispinor_index) = cg(:,1+(iband-1)*npw_k+ibs:iband*npw_k+ibs)
704            end do
705            call xmpi_sum(cwavefb,mpi_enreg%comm_spinor,ierr)
706          end if
707 
708          call timab(537,1,tsec) !"prep_fourwf%vtow"
709          if (nspinor1TreatedByThisProc) then
710            call prep_fourwf(rhoaug(:,:,:,1),blocksize,cwavefb(:,:,1),wfraug,iblock,&
711 &           istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,npw_k,&
712 &           gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
713 &           use_gpu_cuda=dtset%use_gpu_cuda)
714          end if
715          if(dtset%nspden==1) then
716            if (nspinor2TreatedByThisProc) then
717              call prep_fourwf(rhoaug(:,:,:,1),blocksize,cwavefb(:,:,2),wfraug,&
718 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,&
719 &             gs_hamk%ngfft,npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,&
720 &             gs_hamk%ucvol,wtk,use_gpu_cuda=dtset%use_gpu_cuda)
721            end if
722          else if(dtset%nspden==4) then
723            ABI_ALLOCATE(cwavef_x,(2,npw_k*blocksize))
724            ABI_ALLOCATE(cwavef_y,(2,npw_k*blocksize))
725            cwavef_x(:,:)=cwavefb(:,1:npw_k*blocksize,1)+cwavefb(:,:,2)
726            cwavef_y(1,:)=cwavefb(1,1:npw_k*blocksize,1)+cwavefb(2,:,2)
727            cwavef_y(2,:)=cwavefb(2,:,1)-cwavefb(1,:,2)
728            if (nspinor1TreatedByThisProc) then
729              call prep_fourwf(rhoaug(:,:,:,4),blocksize,cwavefb(:,:,2),wfraug,&
730 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,&
731 &             npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
732 &             use_gpu_cuda=dtset%use_gpu_cuda)
733            end if
734            if (nspinor2TreatedByThisProc) then
735              call prep_fourwf(rhoaug(:,:,:,2),blocksize,cwavef_x,wfraug,&
736 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,&
737 &             npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
738 &             use_gpu_cuda=dtset%use_gpu_cuda)
739              call prep_fourwf(rhoaug(:,:,:,3),blocksize,cwavef_y,wfraug,&
740 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,&
741 &             npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
742 &             use_gpu_cuda=dtset%use_gpu_cuda)
743            end if
744            ABI_DEALLOCATE(cwavef_x)
745            ABI_DEALLOCATE(cwavef_y)
746          end if
747          call timab(537,2,tsec)
748          ABI_DEALLOCATE(cwavefb)
749        end if
750      end if
751    end if ! End of SCF calculation
752 
753 !    Call to nonlocal operator:
754 !    - Compute nonlocal forces from most recent wfs
755 !    - PAW: compute projections of WF onto NL projectors (cprj)
756    if(iscf>0.or.gs_hamk%usecprj==1)then
757      if (gs_hamk%usepaw==1.or.optforces/=0) then
758 !      Treat all wavefunctions in case of varying occupation numbers or PAW
759 !      Only treat occupied bands in case of fixed occupation numbers and NCPP
760        if(fixed_occ.and.abs(occblock)<=tol8.and.gs_hamk%usepaw==0) then
761          if (optforces>0) grnl_k(:,(iblock-1)*blocksize+1:iblock*blocksize)=zero
762        else
763          if(gs_hamk%usepaw==1) then
764            call timab(554,1,tsec)  ! "vtowfk:rhoij"
765          end if
766          if(cpopt==1) then
767            iband=1+(iblock-1)*bandpp_cprj
768            call pawcprj_copy(cprj(:,1+(iblock-1)*my_nspinor*blocksize+ibg:iblock*my_nspinor*blocksize+ibg),cwaveprj)
769          end if
770          if (mpi_enreg%paral_kgb==1) then
771            call timab(572,1,tsec) ! 'prep_nonlop%vtowfk'
772            call prep_nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir, &
773 &           eig_k(1+(iblock-1)*blocksize:iblock*blocksize),blocksize,&
774 &           mpi_enreg,nnlout,paw_opt,signs,nonlop_dum,tim_nonlop_prep,cwavef,cwavef)
775            call timab(572,2,tsec)
776          else
777            call nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir,eig_k(1+(iblock-1)*blocksize:iblock*blocksize),&
778 &           mpi_enreg,blocksize,nnlout,&
779 &           paw_opt,signs,nonlop_dum,tim_nonlop,cwavef,cwavef)
780          end if
781          if(gs_hamk%usepaw==1) then
782            call timab(554,2,tsec)
783          end if
784 !        Acccumulate forces
785          if (optforces>0) then
786            iband=(iblock-1)*blocksize
787            do iblocksize=1,blocksize
788              ii=0
789              if (nnlout>3*natom) ii=6
790              iband=iband+1;ibs=ii+nnlout*(iblocksize-1)
791              grnl_k(1:nnlout,iband)=enlout(ibs+1:ibs+nnlout)
792            end do
793          end if
794 !        Store cprj (<Pnl|Psi>)
795          if (gs_hamk%usepaw==1.and.gs_hamk%usecprj==1) then
796            iband=1+(iblock-1)*bandpp_cprj
797            call pawcprj_put(gs_hamk%atindx,cwaveprj,cprj,natom,iband,ibg,ikpt,iorder_cprj,isppol,&
798 &           mband_cprj,dtset%mkmem,natom,bandpp_cprj,nband_k_cprj,gs_hamk%dimcprj,my_nspinor,&
799 &           dtset%nsppol,dtfil%unpaw,mpicomm=mpi_enreg%comm_kpt,proc_distrb=mpi_enreg%proc_distrb)
800          end if
801        end if
802      end if ! PAW or forces
803    end if ! iscf>0 or iscf=-3
804 
805  end do !  End of loop on blocks
806 
807  ABI_DEALLOCATE(cwavef)
808  ABI_DEALLOCATE(enlout)
809 
810  if (gs_hamk%usepaw==1.and.(iscf>0.or.gs_hamk%usecprj==1)) then
811    call pawcprj_free(cwaveprj)
812  end if
813  ABI_DATATYPE_DEALLOCATE(cwaveprj)
814 
815  if (fixed_occ.and.iscf>0) then
816    ABI_DEALLOCATE(wfraug)
817  end if
818 
819 !Write the number of one-way 3D ffts skipped until now (in case of fixed occupation numbers
820  if(iscf>0 .and. fixed_occ .and. (prtvol>2 .or. ikpt<=nkpt_max) )then
821    write(message,'(a,i0)')' vtowfk : number of one-way 3D ffts skipped in vtowfk until now =',nskip
822    call wrtout(std_out,message,'PERS')
823  end if
824 
825 !Norm-conserving only: Compute nonlocal part of total energy : rotate subvnl
826  if (gs_hamk%usepaw==0 .and. wfopta10 /= 1 .and. .not. newlobpcg ) then
827    call timab(586,1,tsec)   ! 'vtowfk(nonlocalpart)'
828    ABI_ALLOCATE(matvnl,(2,nband_k,nband_k))
829    ABI_ALLOCATE(mat1,(2,nband_k,nband_k))
830    mat1=zero
831 
832    if (wfopta10==4) then
833      enl_k(1:nband_k)=zero
834 
835      if (istwf_k==1) then
836        call zhemm('l','l',nband_k,nband_k,cone,totvnl,nband_k,evec,nband_k,czero,mat1,nband_k)
837        do iband=1,nband_k
838          res = cg_real_zdotc(nband_k,evec(:,iband),mat1(:,:,iband))
839          enl_k(iband)= res
840        end do
841      else if (istwf_k==2) then
842        ABI_ALLOCATE(evec_loc,(nband_k,nband_k))
843        ABI_ALLOCATE(mat_loc,(nband_k,nband_k))
844        do iband=1,nband_k
845          do jj=1,nband_k
846            evec_loc(iband,jj)=evec(2*iband-1,jj)
847          end do
848        end do
849        call dsymm('l','l',nband_k,nband_k,one,totvnl,nband_k,evec_loc,nband_k,zero,mat_loc,nband_k)
850        do iband=1,nband_k
851          enl_k(iband)=ddot(nband_k,evec_loc(:,iband),1,mat_loc(:,iband),1)
852        end do
853        ABI_DEALLOCATE(evec_loc)
854        ABI_DEALLOCATE(mat_loc)
855      end if
856 
857    else
858 !    MG: This version is much faster with good OMP scalability.
859 !    Construct upper triangle of matvnl from subvnl using full storage mode.
860      pidx=0
861      do jj=1,nband_k
862        do ii=1,jj
863          pidx=pidx+1
864          matvnl(1,ii,jj)=subvnl(2*pidx-1)
865          matvnl(2,ii,jj)=subvnl(2*pidx  )
866        end do
867      end do
868 
869      call zhemm('L','U',nband_k,nband_k,cone,matvnl,nband_k,evec,nband_k,czero,mat1,nband_k)
870 
871 !$OMP PARALLEL DO PRIVATE(res)
872      do iband=1,nband_k
873        res = cg_real_zdotc(nband_k,evec(:,iband),mat1(:,:,iband))
874        enl_k(iband) = res
875      end do
876    end if
877 
878    ABI_DEALLOCATE(matvnl)
879    ABI_DEALLOCATE(mat1)
880    call timab(586,2,tsec)
881  end if
882 
883 !###################################################################
884 
885  if (iscf<=0 .and. residk>dtset%tolwfr) then
886    write(message,'(a,2i5,a,es13.5)')&
887 &   'Wavefunctions not converged for nnsclo,ikpt=',nnsclo_now,ikpt,' max resid=',residk
888    MSG_WARNING(message)
889  end if
890 
891 
892 !Print out eigenvalues (hartree)
893  if (prtvol>2 .or. ikpt<=nkpt_max) then
894    write(message, '(5x,a,i5,2x,a,a,a,i4,a,i4,a)' ) &
895 &   'eigenvalues (hartree) for',nband_k,'bands',ch10,&
896 &   '              after ',inonsc,' non-SCF iterations with ',dtset%nline,' CG line minimizations'
897    call wrtout(std_out,message,'PERS')
898    do ii=0,(nband_k-1)/6
899      write(message, '(1p,6e12.4)' ) (eig_k(iband),iband=1+6*ii,min(6+6*ii,nband_k))
900      call wrtout(std_out,message,'PERS')
901    end do
902  else if(ikpt==nkpt_max+1)then
903    call wrtout(std_out,' vtowfk : prtvol=0 or 1, do not print more k-points.','PERS')
904  end if
905 
906 !Print out decomposition of eigenvalues in the non-selfconsistent case or if prtvol>=10
907  if( (iscf<0 .and. (prtvol>2 .or. ikpt<=nkpt_max)) .or. prtvol>=10)then
908    write(message, '(5x,a,i5,2x,a,a,a,i4,a,i4,a)' ) &
909 &   ' mean kinetic energy (hartree) for ',nband_k,' bands',ch10,&
910 &   '              after ',inonsc,' non-SCF iterations with ',dtset%nline,' CG line minimizations'
911    call wrtout(std_out,message,'PERS')
912 
913    do ii=0,(nband_k-1)/6
914      write(message, '(1p,6e12.4)' ) (ek_k(iband),iband=1+6*ii,min(6+6*ii,nband_k))
915      call wrtout(std_out,message,'PERS')
916    end do
917 
918    if (gs_hamk%usepaw==0) then
919      write(message, '(5x,a,i5,2x,a,a,a,i4,a,i4,a)' ) &
920 &     ' mean non-local energy (hartree) for ',nband_k,' bands',ch10,&
921 &     '              after ',inonsc,' non-SCF iterations with ',dtset%nline,' CG line minimizations'
922      call wrtout(std_out,message,'PERS')
923 
924      do ii=0,(nband_k-1)/6
925        write(message,'(1p,6e12.4)') (enl_k(iband),iband=1+6*ii,min(6+6*ii,nband_k))
926        call wrtout(std_out,message,'PERS')
927      end do
928    end if
929  end if
930 
931  !Hamiltonian constructor for gwls_sternheimer
932  if(dtset%optdriver==RUNL_GWLS) then
933    call build_H(dtset,mpi_enreg,cpopt,cg,gs_hamk,kg_k,kinpw)
934  end if
935 
936  if(wfopta10 /= 1 .and. .not. newlobpcg) then
937    ABI_DEALLOCATE(evec)
938    ABI_DEALLOCATE(subham)
939    !if (gs_hamk%usepaw==0) then
940    !if (wfopta10==4) then
941    ABI_DEALLOCATE(totvnl)
942    !else
943    ABI_DEALLOCATE(subvnl)
944    !end if
945    !end if
946    ABI_DEALLOCATE(subovl)
947  end if
948  if ( .not. newlobpcg ) then
949    ABI_DEALLOCATE(gsc)
950  end if
951 
952  if(wfoptalg==3) then
953    ABI_DEALLOCATE(eig_save)
954  end if
955 
956 !Structured debugging : if prtvol=-level, stop here.
957  if(prtvol==-level)then
958    write(message,'(a,a,a,i0,a)')' vtowfk : exit ',ch10,'  prtvol=-',level,', debugging mode => stop '
959    MSG_ERROR(message)
960  end if
961 
962  call timab(30,2,tsec)
963  call timab(28,2,tsec)
964 
965  DBG_EXIT("COLL")
966 
967 end subroutine vtowfk