TABLE OF CONTENTS


ABINIT/m_vtowfk [ Modules ]

[ Top ] [ Modules ]

NAME

  m_vtowfk

FUNCTION

COPYRIGHT

  Copyright (C) 1998-2024 ABINIT group (DCA, XG, GMR, MT)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 ! nvtx related macro definition
23 #include "nvtx_macros.h"
24 
25 module m_vtowfk
26 
27   use, intrinsic :: iso_fortran_env, only: int32, int64, real32, real64
28 
29  use defs_basis
30  use m_abicore
31  use m_errors
32  use m_xmpi
33  use m_efield
34  use m_linalg_interfaces
35  use m_cgtools
36  use m_dtset
37  use m_dtfil
38  use m_xomp
39 
40  use defs_abitypes, only : MPI_type
41  use m_time,        only : timab, cwtime, cwtime_report, sec2str
42  use m_fstrings,    only : sjoin, itoa, ftoa
43  use m_hamiltonian, only : gs_hamiltonian_type
44  use m_getghc,      only : getghc_nucdip
45  use m_paw_dmft,    only : paw_dmft_type
46  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_put,pawcprj_copy
47  use m_paw_dmft,    only : paw_dmft_type
48  use m_gwls_hamiltonian, only : build_H
49  use m_fftcore,     only : fftcore_set_mixprec, fftcore_mixprec
50  use m_cgwf,        only : cgwf
51  use m_cgwf_cprj,   only : cgwf_cprj,mksubovl,cprj_update,cprj_update_oneband
52  use m_lobpcgwf_old,only : lobpcgwf
53  use m_lobpcgwf,    only : lobpcgwf2
54  use m_chebfiwf,    only : chebfiwf2
55  use m_spacepar,    only : meanvalue_g
56  use m_chebfi,      only : chebfi
57  use m_rmm_diis,    only : rmm_diis
58  use m_nonlop,      only : nonlop !, nonlop_counter
59  use m_prep_kgb,    only : prep_nonlop, prep_fourwf
60  use m_cgprj,       only : cprj_rotate
61  use m_fft,         only : fourwf
62  use m_cgtk,        only : cgtk_fixphase
63 
64 #if defined HAVE_YAKL
65  use gator_mod
66 #endif
67 
68 #ifdef HAVE_OPENMP_OFFLOAD
69  use m_ompgpu_fourwf
70 #endif
71 
72 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
73  use m_nvtx_data
74 #endif
75 
76  implicit none
77 
78  private

ABINIT/vtowfk [ Functions ]

[ Top ] [ Functions ]

NAME

 vtowfk

FUNCTION

 This routine compute the partial density at a given k-point,
 for a given spin-polarization, from a fixed Hamiltonian
 but might also simply compute eigenvectors and eigenvalues at this k point

INPUTS

  cgq = array that holds the WF of the nearest neighbours of
        the current k-point (electric field, MPI //)
  cpus= cpu time limit in seconds
  dtefield <type(efield_type)> = variables related to Berry phase
      calculations (see initberry.f)
  dtfil <type(datafiles_type)>=variables related to files
  dtset <type(dataset_type)>=all input variables for this dataset
  fixed_occ=true if electronic occupations are fixed (occopt<3)
  gs_hamk <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k
  ibg=shift to be applied on the location of data in the array cprj
  icg=shift to be applied on the location of data in the array cg
  ikpt=number of the k-point
  iscf=(<= 0  =>non-SCF), >0 => SCF
  isppol= 1 for unpolarized, 2 for spin-polarized
  kg_k(3,npw_k)=reduced planewave coordinates.
  kinpw(npw_k)=(modified) kinetic energy for each plane wave (Hartree)
  mcg=second dimension of the cg array
  mcgq=second dimension of the cgq array (electric field, MPI //)
  mcprj=size of projected wave-functions array (cprj) =nspinor*mband*mkmem*nsppol
  mkgq = second dimension of pwnsfacq
  mpi_enreg=information about MPI parallelization
  mpw=maximum dimensioned size of npw
  natom=number of atoms in cell.
  nband_k=number of bands at this k point for that spin polarization
  nkpt=number of k points.
  istep=index of the number of steps in the routine scfcv
  nnsclo_now=number of non-self-consistent loops for the current vtrial
             (often 1 for SCF calculation, =nstep for non-SCF calculations)
  npw_k=number of plane waves at this k point
  npwarr(nkpt)=number of planewaves in basis at this k point
  occ_k(nband_k)=occupation number for each band (usually 2) for each k.
  optforces=option for the computation of forces
  prtvol=control print volume and debugging output
  pwind(pwind_alloc,2,3)= array used to compute
           the overlap matrix smat between k-points (see initberry.f)
  pwind_alloc= first dimension of pwind
  pwnsfac(2,pwind_alloc)= phase factors for non-symmorphic translations
                          (see initberry.f)
  pwnsfacq(2,mkgq)= phase factors for the nearest neighbours of the
                    current k-point (electric field, MPI //)
  usebanfft=flag for band-fft parallelism
  paw_dmft  <type(paw_dmft_type)>= paw+dmft related data
  wtk=weight assigned to the k point.
  zshift(nband_k)=energy shifts for the squared shifted hamiltonian algorithm

OUTPUT

  dphase_k(3)=change in Zak phase for the current k-point
  eig_k(nband_k)=array for holding eigenvalues (hartree)
  ek_k(nband_k)=contribution from each band to kinetic energy, at this k-point
  ek_k_nd(2,nband_k,nband_k*use_dmft)=contribution to kinetic energy,
     including non-diagonal terms, at this k-point (usefull if use_dmft)
  end_k(nband_k)=contribution from each band to nuclear dipole energy, at this k-point
  resid_k(nband_k)=residuals for each band over all k points, BEFORE the band rotation.
   In input: previous residuals.
  ==== if optforces>0 ====
    grnl_k(3*natom,nband_k)=nonlocal gradients, at this k-point
  ==== if gs_hamk%usepaw==0 ====
    enlx_k(nband_k)=contribution from each band to
                    nonlocal pseudopotential + Fock-type part of total energy, at this k-point
  ==== if (gs_hamk%usepaw==1) ====
    cprj(natom,mcprj*usecprj)= wave functions projected with non-local projectors:
                               cprj(n,k,i)=<p_i|Cnk> where p_i is a non-local projector.

SIDE EFFECTS

  cg(2,mcg)=updated wavefunctions
  rhoaug(n4,n5,n6,nvloc)= density in electrons/bohr**3, on the augmented fft grid.
                    (cumulative, so input as well as output). Update only
                    for occopt<3 (fixed occupation numbers)
  rmm_diis_status: Status of the RMM-DIIS eigensolver. See m_rmm_diis.

NOTES

  The cprj are distributed over band and spinors processors.
  One processor doesn't know all the cprj.
  Only the mod((iband-1)/mpi_enreg%bandpp,mpi_enreg%nproc_band) projectors
  are stored on each proc.

SOURCE

 175 subroutine vtowfk(cg,cgq,cprj,cpus,dphase_k,dtefield,dtfil,dtset,&
 176 & eig_k,ek_k,ek_k_nd,end_k,enlx_k,fixed_occ,grnl_k,gs_hamk,&
 177 & ibg,icg,ikpt,iscf,isppol,kg_k,kinpw,mband_cprj,mcg,mcgq,mcprj,mkgq,mpi_enreg,&
 178 & mpw,natom,nband_k,nbdbuf,nkpt,istep,nnsclo_now,npw_k,npwarr,occ_k,optforces,prtvol,&
 179 & pwind,pwind_alloc,pwnsfac,pwnsfacq,resid_k,rhoaug,paw_dmft,wtk,zshift, rmm_diis_status)
 180 
 181 !Arguments ------------------------------------
 182  integer, intent(in) :: ibg,icg,ikpt,iscf,isppol,mband_cprj,mcg,mcgq,mcprj,mkgq,mpw
 183  integer, intent(in) :: natom,nband_k,nbdbuf,nkpt,nnsclo_now,npw_k,optforces
 184  integer, intent(in) :: prtvol,pwind_alloc,istep
 185  logical,intent(in) :: fixed_occ
 186  real(dp), intent(in) :: cpus,wtk
 187  type(datafiles_type), intent(in) :: dtfil
 188  type(efield_type), intent(inout) :: dtefield
 189  type(dataset_type), intent(in) :: dtset
 190  type(gs_hamiltonian_type), intent(inout) :: gs_hamk
 191  type(MPI_type), intent(inout) :: mpi_enreg
 192  type(paw_dmft_type), intent(in)  :: paw_dmft
 193  integer, intent(in) :: kg_k(3,npw_k)
 194  integer, intent(in) :: npwarr(nkpt),pwind(pwind_alloc,2,3)
 195  integer, intent(inout) :: rmm_diis_status(2)
 196  real(dp), intent(in) :: cgq(2,mcgq),kinpw(npw_k),occ_k(nband_k)
 197  real(dp), intent(in) :: pwnsfac(2,pwind_alloc),pwnsfacq(2,mkgq)
 198  real(dp), intent(in) :: zshift(nband_k)
 199  real(dp), intent(out) :: eig_k(nband_k),ek_k(nband_k),dphase_k(3),ek_k_nd(2,nband_k,nband_k*paw_dmft%use_dmft)
 200  real(dp), intent(out) :: end_k(nband_k),enlx_k(nband_k)
 201  real(dp), intent(out) :: grnl_k(3*natom,nband_k*optforces)
 202  real(dp), intent(inout) :: resid_k(nband_k)
 203  real(dp), intent(inout),target :: cg(2,mcg)
 204  real(dp), intent(inout) :: rhoaug(gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,gs_hamk%nvloc)
 205  type(pawcprj_type),intent(inout),target :: cprj(natom,mcprj*gs_hamk%usecprj)
 206 
 207 !Local variables-------------------------------
 208  logical :: has_cprj_in_memory,has_fock,newchebfi,newlobpcg,update_cprj
 209  logical :: do_subdiago,do_ortho,rotate_subvnlx,use_rmm_diis
 210  integer,parameter :: level=112,tim_fourwf=2,tim_nonlop_prep=11,enough=3,tim_getcprj=5
 211  integer,save :: nskip=0
 212 !     Flag use_subovl: 1 if "subovl" array is computed (see below)
 213 !     subovl should be Identity (in that case we should use use_subovl=0)
 214 !     But this is true only if conjugate gradient algo. converges
 215  integer :: use_subovl=0
 216  integer :: use_subvnlx=0
 217  integer :: use_totvnlx=0
 218  integer :: bandpp_cprj,blocksize,choice,cpopt,iband,iband1
 219  integer :: iblock,iblocksize,ibs,idir,ierr,igs,igsc,ii,inonsc
 220  integer :: iorder_cprj,ipw,ispinor,ispinor_index,istwf_k,iwavef,mgsc,my_nspinor,n1,n2,n3 !kk
 221  integer :: nband_k_cprj,nblockbd,ncpgr,ndat,nkpt_max,nnlout,ortalgo
 222  integer :: paw_opt,quit,signs,spaceComm,tim_nonlop,wfoptalg,wfopta10
 223  logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc
 224  real(dp) :: ar,ar_im,eshift,occblock,norm
 225  real(dp) :: residk,weight,cpu,wall,gflops
 226  character(len=500) :: msg
 227  real(dp) :: dummy(2,1),nonlop_dum(1,1),tsec(2)
 228  real(dp),allocatable :: cwavef1(:,:),cwavef_x(:,:),cwavef_y(:,:),cwavefb(:,:,:)
 229 
 230 #if defined HAVE_GPU && defined HAVE_YAKL
 231  real(real64), ABI_CONTIGUOUS pointer :: cwavef(:,:)  => null()
 232 #else
 233  real(dp),allocatable,target :: cwavef(:,:)
 234 #endif
 235 
 236  real(dp),allocatable :: eig_save(:),enlout(:),evec(:,:),gsc(:,:),ghc_vectornd(:,:)
 237  real(dp),allocatable :: subham(:),subovl(:),subvnlx(:),totvnlx(:,:)
 238 
 239 
 240 #if defined HAVE_GPU && defined HAVE_YAKL
 241  real(real64), ABI_CONTIGUOUS pointer :: wfraug(:,:,:,:)
 242 #else
 243  real(dp),allocatable :: wfraug(:,:,:,:)
 244 #endif
 245 
 246  real(dp),pointer :: cwavef_iband(:,:)
 247  type(pawcprj_type),pointer :: cwaveprj(:,:)
 248  type(pawcprj_type),pointer :: cprj_cwavef_bands(:,:),cprj_cwavef(:,:)
 249 
 250  real(dp), allocatable :: weight_t(:) ! only allocated and used with GPU fourwf
 251 
 252 
 253 ! **********************************************************************
 254 
 255  DBG_ENTER("COLL")
 256 
 257  call timab(28,1,tsec) ! Keep track of total time spent in "vtowfk"
 258 
 259 !Structured debugging if prtvol==-level
 260  if(prtvol==-level)then
 261    write(msg,'(80a,a,a)') ('=',ii=1,80),ch10,'vtowfk: enter'
 262    call wrtout(std_out,msg,'PERS')
 263  end if
 264 
 265 !=========================================================================
 266 !============= INITIALIZATIONS AND ALLOCATIONS ===========================
 267 !=========================================================================
 268 
 269  nkpt_max=50; if(xmpi_paral==1)nkpt_max=-1
 270 
 271  wfoptalg=mod(dtset%wfoptalg,100); wfopta10=mod(wfoptalg,10)
 272  newlobpcg = (dtset%wfoptalg == 114)
 273  newchebfi = (dtset%wfoptalg == 111)
 274  istwf_k=gs_hamk%istwf_k
 275  has_fock=(associated(gs_hamk%fockcommon))
 276  quit=0
 277  igsc=0
 278 
 279 !Parallelization over spinors management
 280  my_nspinor=max(1,dtset%nspinor/mpi_enreg%nproc_spinor)
 281  if (mpi_enreg%paral_spinor==0) then
 282    ispinor_index=1
 283    nspinor1TreatedByThisProc=.true.
 284    nspinor2TreatedByThisProc=(dtset%nspinor==2)
 285  else
 286    ispinor_index=mpi_enreg%me_spinor+1
 287    nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0)
 288    nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1)
 289  end if
 290 
 291 !Parallelism over FFT and/or bands: define sizes and tabs
 292  !if (mpi_enreg%paral_kgb==1) then
 293  nblockbd=nband_k/(mpi_enreg%nproc_band*mpi_enreg%bandpp)
 294  !else
 295  !  nblockbd=nband_k/mpi_enreg%nproc_fft
 296  !  if (nband_k/=nblockbd*mpi_enreg%nproc_fft) nblockbd=nblockbd+1
 297  !end if
 298  blocksize=nband_k/nblockbd
 299 
 300 !Save eshift
 301  if(wfoptalg==3)then
 302    eshift=zshift(1)
 303    ABI_MALLOC(eig_save,(nband_k))
 304    eig_save(:)=eshift
 305  end if
 306 
 307  n1=gs_hamk%ngfft(1); n2=gs_hamk%ngfft(2); n3=gs_hamk%ngfft(3)
 308 
 309  ! Decide whether RMM-DIIS eigensolver should be activated.
 310  ! rmm_diis > 0 --> Activate it after (3 + rmm_diis) iterations with wfoptalg algorithm.
 311  ! rmm_diis < 0 --> Start with RMM-DIIS directly (risky)
 312  use_rmm_diis = .False.
 313  if (dtset%rmm_diis /= 0 .and. iscf > 0) then
 314    use_rmm_diis = istep > 3 + dtset%rmm_diis
 315    !if (use_rmm_diis) call wrtout(std_out, " Activating RMM-DIIS eigensolver in SCF mode.")
 316  end if
 317  !nonlop_counter = 0
 318 
 319  mgsc=0
 320  igsc=0
 321  has_cprj_in_memory=dtset%cprj_in_memory/=0
 322  if ((.not. newlobpcg .and. .not.has_cprj_in_memory) .or. dtset%rmm_diis /= 0) then
 323    mgsc=nband_k*npw_k*my_nspinor*gs_hamk%usepaw
 324    ABI_MALLOC_OR_DIE(gsc,(2,mgsc), ierr)
 325    gsc=zero
 326  else
 327    ABI_MALLOC(gsc,(0,0))
 328  end if
 329 
 330  if(wfopta10 /= 1 .and. .not. newlobpcg ) then
 331    !chebfi already does this stuff inside
 332    ABI_MALLOC(evec,(2*nband_k,nband_k))
 333    ABI_MALLOC(subham,(nband_k*(nband_k+1)))
 334 
 335    ABI_MALLOC(subvnlx,(0))
 336    ABI_MALLOC(totvnlx,(0,0))
 337    if (wfopta10==4) then
 338 !    Later, will have to generalize to Fock case, like when wfopta10/=4
 339      if (gs_hamk%usepaw==0) then
 340        ABI_FREE(totvnlx)
 341        if (istwf_k==1) then
 342          ABI_MALLOC(totvnlx,(2*nband_k,nband_k))
 343        else if (istwf_k==2) then
 344          ABI_MALLOC(totvnlx,(nband_k,nband_k))
 345        end if
 346        use_totvnlx=1
 347      endif
 348    else
 349      if (gs_hamk%usepaw==0 .or. has_fock) then
 350        ABI_FREE(subvnlx)
 351        ABI_MALLOC(subvnlx,(nband_k*(nband_k+1)))
 352        use_subvnlx=1
 353      end if
 354    end if
 355 
 356    if (use_subovl==1) then
 357      ABI_MALLOC(subovl,(nband_k*(nband_k+1)))
 358    else
 359      ABI_MALLOC(subovl,(0))
 360    end if
 361  end if
 362 
 363  ! Carry out UP TO dtset%nline steps, or until resid for every band is < dtset%tolwfr
 364  if (prtvol/=5 .and. (prtvol>2 .or. ikpt <= nkpt_max)) then
 365    write(msg,'(a,i5,2x,a,3f9.5,2x,a)')' non-scf iterations; kpt # ',ikpt,', k= (',gs_hamk%kpt_k,'), band residuals:'
 366    call wrtout(std_out,msg,'PERS')
 367  end if
 368 
 369  if (has_cprj_in_memory) then
 370    if(ikpt==1) then
 371      write(msg,'(a,i3)') ' In vtowfk : use of cprj in memory with cprj_update_lvl=',dtset%cprj_update_lvl
 372      call wrtout(std_out,msg,'COLL')
 373    end if
 374    cprj_cwavef_bands => cprj(:,1+ibg:nband_k*my_nspinor+ibg)
 375  end if
 376 
 377 !Electric field: initialize dphase_k
 378  dphase_k(:) = zero
 379 
 380 !=========================================================================
 381 !==================== NON-SELF-CONSISTENT LOOP ===========================
 382 !=========================================================================
 383 
 384 !nnsclo_now=number of non-self-consistent loops for the current vtrial
 385 !(often 1 for SCF calculation, =nstep for non-SCF calculations)
 386  call timab(39,1,tsec) ! "vtowfk (loop)"
 387 
 388  do inonsc=1,nnsclo_now
 389    ABI_NVTX_START_RANGE(NVTX_VTOWFK_EXTRA1)
 390    if (iscf < 0 .and. (inonsc <= enough .or. mod(inonsc, 10) == 0)) call cwtime(cpu, wall, gflops, "start")
 391 
 392    if (dtset%rmm_diis /= 0 .and. iscf < 0) then
 393      use_rmm_diis = inonsc > 3 + dtset%rmm_diis
 394      !if (use_rmm_diis) call wrtout(std_out, " Activating RMM-DIIS eigensolver in NSCF mode.")
 395    end if
 396 
 397    ! This initialisation is needed for the MPI-parallelisation (gathering using sum)
 398    if(wfopta10 /= 1 .and. .not. newlobpcg .and. .not. newchebfi) then
 399      subham(:)=zero
 400      if (gs_hamk%usepaw==0) then
 401        if (wfopta10==4) then
 402          totvnlx(:,:)=zero
 403        else
 404          subvnlx(:)=zero
 405        end if
 406      end if
 407      if (use_subovl==1)subovl(:)=zero
 408    end if
 409 
 410    !resid_k(:)=zero
 411 
 412    !call cg_kfilter(npw_k, my_nspinor, nband_k, kinpw, cg(:, icg+1))
 413 
 414 !  Filter the WFs when modified kinetic energy is too large (see routine mkkin.f)
 415 !  !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(igs,iwavef)
 416    do iband=1,nband_k
 417      iwavef=(iband-1)*npw_k*my_nspinor+icg
 418      cwavef_iband => cg(:,1+iwavef:npw_k*my_nspinor+iwavef)
 419      update_cprj=.False.
 420      do ispinor=1,my_nspinor
 421        igs=(ispinor-1)*npw_k
 422        do ipw=1+igs,npw_k+igs
 423          if(kinpw(ipw-igs)>huge(zero)*1.d-11)then
 424            norm=cwavef_iband(1,ipw)**2+cwavef_iband(2,ipw)**2
 425            if (norm>tol15*tol15) update_cprj=.True.
 426            cwavef_iband(:,ipw)=zero
 427          end if
 428        end do
 429      end do
 430      if (has_cprj_in_memory.and.update_cprj) then
 431        cprj_cwavef => cprj_cwavef_bands(:,my_nspinor*(iband-1)+1:my_nspinor*iband)
 432        call cprj_update_oneband(cwavef_iband,cprj_cwavef,gs_hamk,mpi_enreg,tim_getcprj)
 433      end if
 434    end do
 435    ABI_NVTX_END_RANGE()
 436 
 437 
 438    ! JLJ 17/10/2014: If it is a GWLS calculation, construct the hamiltonian
 439    ! as in a usual GS calc., but skip any minimisation procedure.
 440    ! This would be equivalent to nstep=0, if the latter did work.
 441    if(dtset%optdriver/=RUNL_GWLS) then
 442 
 443      if(wfopta10==4.or.wfopta10==1) then
 444        if (istwf_k/=1.and.istwf_k/=2) then !no way to use lobpcg
 445          write(msg,'(3a)')&
 446           'Only istwfk=1 or 2 are allowed with wfoptalg=4/14 !',ch10,&
 447           'Action: put istwfk to 1 or remove k points with half integer coordinates.'
 448          ABI_ERROR(msg)
 449        end if
 450 
 451        if (dtset%gpu_option==ABI_GPU_KOKKOS) then
 452          ! Kokkos GPU branch is not OpenMP thread-safe, setting OpenMP num threads to 1
 453          call xomp_set_num_threads(1)
 454        end if
 455 
 456 !    =========================================================================
 457 !    ============ MINIMIZATION OF BANDS: LOBPCG ==============================
 458 !    =========================================================================
 459        if (wfopta10==4) then
 460 
 461          if (use_rmm_diis) then
 462            call rmm_diis(istep, ikpt, isppol, cg(:,icg+1:), dtset, eig_k, occ_k, enlx_k, gs_hamk, kinpw, gsc, &
 463                          mpi_enreg, nband_k, npw_k, my_nspinor, resid_k, rmm_diis_status)
 464          else
 465 
 466             if ( .not. newlobpcg ) then
 467 
 468              ABI_NVTX_START_RANGE(NVTX_LOBPCG1)
 469              call lobpcgwf(cg,dtset,gs_hamk,gsc,icg,igsc,kinpw,mcg,mgsc,mpi_enreg,&
 470 &             nband_k,nblockbd,npw_k,prtvol,resid_k,subham,totvnlx,use_totvnlx)
 471              ! In case of FFT parallelism, exchange subspace arrays
 472              spaceComm=mpi_enreg%comm_bandspinorfft
 473              call xmpi_sum(subham,spaceComm,ierr)
 474              if (gs_hamk%usepaw==0) then
 475                if (wfopta10==4) then
 476                  call xmpi_sum(totvnlx,spaceComm,ierr)
 477                else
 478                  call xmpi_sum(subvnlx,spaceComm,ierr)
 479                end if
 480              end if
 481              if (use_subovl==1) call xmpi_sum(subovl,spaceComm,ierr)
 482              ABI_NVTX_END_RANGE()
 483 
 484           else
 485 
 486              ABI_NVTX_START_RANGE(NVTX_LOBPCG2)
 487              call lobpcgwf2(cg(:,icg+1:),dtset,eig_k,occ_k,enlx_k,gs_hamk,isppol,ikpt,inonsc,istep,kinpw,mpi_enreg,&
 488 &             nband_k,npw_k,my_nspinor,prtvol,resid_k,nbdbuf)
 489              ABI_NVTX_END_RANGE()
 490 
 491           end if
 492 
 493          end if
 494 
 495 !    =========================================================================
 496 !    ============ MINIMIZATION OF BANDS: CHEBYSHEV FILTERING =================
 497 !    =========================================================================
 498        else if (wfopta10 == 1) then
 499          if ( .not. newchebfi) then
 500            ABI_NVTX_START_RANGE(NVTX_CHEBFI1)
 501            call chebfi(cg(:, icg+1:),dtset,eig_k,enlx_k,gs_hamk,gsc,kinpw,&
 502 &           mpi_enreg,nband_k,npw_k,my_nspinor,prtvol,resid_k)
 503            ABI_NVTX_END_RANGE()
 504         else
 505            ABI_NVTX_START_RANGE(NVTX_CHEBFI2)
 506            call chebfiwf2(cg(:, icg+1:),dtset,eig_k,enlx_k,gs_hamk,kinpw,&
 507 &           mpi_enreg,nband_k,npw_k,my_nspinor,prtvol,resid_k)
 508            ABI_NVTX_END_RANGE()
 509         end if
 510        end if
 511 
 512 !      =========================================================================
 513 !      ======== MINIMIZATION OF BANDS: CONJUGATE GRADIENT (Teter et al.) =======
 514 !      =========================================================================
 515      else
 516        ! use_subvnlx=0; if (gs_hamk%usepaw==0 .or. associated(gs_hamk%fockcommon)) use_subvnlx=1
 517        ! use_subvnlx=0; if (gs_hamk%usepaw==0) use_subvnlx=1
 518 
 519        if (.not. use_rmm_diis) then
 520 
 521          if (isppol==1.and.ikpt==1.and.inonsc==1.and.istep==1) then
 522            if (dtset%tolwfr_diago/=zero) then
 523              write(msg, '(a,es16.6)' ) ' cgwf: tolwfr_diago=',dtset%tolwfr_diago
 524              call wrtout(std_out,msg,'COLL')
 525            end if
 526          end if
 527 
 528          if (has_cprj_in_memory) then
 529            call cgwf_cprj(cg,cprj_cwavef_bands,dtset%cprj_update_lvl,eig_k,&
 530 &             gs_hamk,icg,mcg,mpi_enreg,nband_k,dtset%nline,&
 531 &             dtset%ortalg,prtvol,quit,resid_k,subham,dtset%tolrde,dtset%tolwfr_diago,wfoptalg)
 532          else
 533            call cgwf(dtset%berryopt,cg,cgq,dtset%chkexit,cpus,dphase_k,dtefield,dtfil%filnam_ds(1),&
 534 &           gsc,gs_hamk,icg,igsc,ikpt,inonsc,isppol,dtset%mband,mcg,mcgq,mgsc,mkgq,&
 535 &           mpi_enreg,mpw,nband_k,dtset%nbdblock,nkpt,dtset%nline,npw_k,npwarr,my_nspinor,&
 536 &           dtset%nsppol,dtset%ortalg,prtvol,pwind,pwind_alloc,pwnsfac,pwnsfacq,quit,resid_k,&
 537 &           subham,subovl,subvnlx,dtset%tolrde,dtset%tolwfr_diago,use_subovl,use_subvnlx,wfoptalg,zshift)
 538          end if
 539        else
 540          call rmm_diis(istep, ikpt, isppol, cg(:,icg+1:), dtset, eig_k, occ_k, enlx_k, gs_hamk, kinpw, gsc, &
 541                        mpi_enreg, nband_k, npw_k, my_nspinor, resid_k, rmm_diis_status)
 542        end if
 543 
 544        if (dtset%gpu_option==ABI_GPU_KOKKOS) then
 545          ! Kokkos GPU branch is not OpenMp thread-safe, restoring OpenMP threads num
 546          call xomp_set_num_threads(dtset%gpu_kokkos_nthrd)
 547        end if
 548 
 549      end if
 550 
 551    end if
 552 
 553 !  =========================================================================
 554 !  ===================== FIND LARGEST RESIDUAL =============================
 555 !  =========================================================================
 556 
 557 !  Find largest resid over bands at this k point
 558 !  Note that this operation is done BEFORE rotation of bands:
 559 !  it would be time-consuming to recompute the residuals after.
 560    if (nbdbuf>=0) then
 561      residk=maxval(resid_k(1:max(1,nband_k-nbdbuf)))
 562    else if (nbdbuf==-101) then
 563      residk=maxval(occ_k(1:nband_k)*resid_k(1:nband_k))
 564    else
 565      ABI_ERROR('Bad value of nbdbuf')
 566    end if
 567 
 568 !  Print residuals
 569    if(prtvol/=5.and.(prtvol>2 .or. ikpt<=nkpt_max))then
 570      do ii=0,(nband_k-1)/8
 571        write(msg,'(a,8es10.2)')' res:',(resid_k(iband),iband=1+ii*8,min(nband_k,8+ii*8))
 572        call wrtout(std_out,msg,'PERS')
 573      end do
 574    end if
 575 
 576 !  =========================================================================
 577 !  ========== DIAGONALIZATION OF HAMILTONIAN IN WFs SUBSPACE ===============
 578 !  =========================================================================
 579    do_subdiago = .not. wfopta10 == 1 .and. .not. newlobpcg
 580    if (use_rmm_diis) do_subdiago = .False.  ! subdiago is already performed before RMM-DIIS.
 581 
 582    ABI_NVTX_START_RANGE(NVTX_SUB_SPC_DIAGO)
 583    if (do_subdiago) then
 584      if (prtvol > 1) call wrtout(std_out, " Performing subspace diagonalization.")
 585      call timab(585,1,tsec) !"vtowfk(subdiago)"
 586      if (has_cprj_in_memory) then
 587        call subdiago_low_memory(cg,eig_k,evec,icg,istwf_k,&
 588 &       mcg,nband_k,npw_k,my_nspinor,dtset%paral_kgb,subham)
 589        call timab(585,2,tsec)
 590        call timab(578,1,tsec)
 591        call cprj_rotate(cprj_cwavef_bands,evec,gs_hamk%dimcprj,natom,nband_k,gs_hamk%nspinor)
 592        call timab(578,2,tsec)
 593      else
 594        call subdiago(cg, eig_k, evec, gsc, icg, igsc, istwf_k, &
 595        mcg, mgsc, nband_k, npw_k, my_nspinor, dtset%paral_kgb, &
 596        subham, subovl, use_subovl, gs_hamk%usepaw, mpi_enreg%me_g0)
 597        call timab(585,2,tsec)
 598      end if
 599    end if
 600    ABI_NVTX_END_RANGE()
 601 
 602    !  Print energies
 603    if(prtvol/=5.and.(prtvol>2 .or. ikpt<=nkpt_max))then
 604      do ii=0,(nband_k-1)/8
 605        write(msg, '(a,8es10.2)' )' ene:',(eig_k(iband),iband=1+ii*8,min(nband_k,8+ii*8))
 606        call wrtout(std_out,msg,'PERS')
 607      end do
 608    end if
 609 
 610 !  THIS CHANGE OF SHIFT DOES NOT WORK WELL
 611 !  Update zshift in the case of wfoptalg==3
 612 !  if(wfoptalg==3 .and. inonsc/=1)then
 613 !  do iband=1,nband_k
 614 !  if(eig_k(iband)<eshift .and. eig_save(iband)<eshift)then
 615 !  zshift(iband)=max(eig_k(iband),eig_save(iband))
 616 !  end if
 617 !  if(eig_k(iband)>eshift .and. eig_save(iband)>eshift)then
 618 !  zshift(iband)=min(eig_k(iband),eig_save(iband))
 619 !  end if
 620 !  end do
 621 !  eig_save(:)=eig_k(:)
 622 !  end if
 623 
 624 !  =========================================================================
 625 !  =============== ORTHOGONALIZATION OF WFs (if needed) ====================
 626 !  =========================================================================
 627 
 628 !  Re-orthonormalize the wavefunctions at this k point.
 629 !  this step is redundant but is performed to combat rounding error in wavefunction orthogonality.
 630 !  This step is performed inside rmm_diis if RMM-DIIS is activated.
 631 
 632    call timab(583,1,tsec) ! "vtowfk(pw_orthon)"
 633    ortalgo = mpi_enreg%paral_kgb
 634    ! The orthogonalization is completely disabled with ortalg<=-10.
 635    ! This option is usefull for testing only and is not documented.
 636    do_ortho = (wfoptalg/=14 .and. wfoptalg /= 1 .and. wfoptalg /= 11 .and. dtset%ortalg>-10) .or. dtset%ortalg > 0
 637    if (use_rmm_diis) do_ortho = .False.
 638 
 639    if (do_ortho) then
 640 
 641      ABI_NVTX_START_RANGE(NVTX_ORTHO_WF)
 642 
 643      if (prtvol > 0) call wrtout(std_out, " Calling pw_orthon to orthonormalize bands.")
 644      if (has_cprj_in_memory.and.ortalgo==0) then
 645        ABI_FREE(subovl)
 646        ABI_MALLOC(subovl,(nband_k*(nband_k+1)))
 647        call mksubovl(cg,cprj_cwavef_bands,gs_hamk,icg,nband_k,subovl,mpi_enreg)
 648        call pw_orthon_cprj(icg,mcg,npw_k*my_nspinor,my_nspinor,nband_k,ortalgo,subovl,cg,cprj=cprj_cwavef_bands)
 649      else
 650        call pw_orthon(icg,igsc,istwf_k,mcg,mgsc,npw_k*my_nspinor,nband_k,ortalgo,gsc,gs_hamk%usepaw,cg,&
 651 &        mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft)
 652      end if
 653 
 654      ABI_NVTX_END_RANGE()
 655    end if
 656    call timab(583,2,tsec)
 657 
 658    ABI_NVTX_START_RANGE(NVTX_VTOWFK_EXTRA2)
 659 
 660    ! DEBUG seq==par comment next block
 661    ! Fix phases of all bands
 662    if (xmpi_paral/=1 .or. mpi_enreg%paral_kgb/=1) then
 663      !call wrtout(std_out, "Calling cgtk_fixphase")
 664       if ( (.not.newlobpcg) .and. (.not.newchebfi) .and. (.not.has_cprj_in_memory) ) then
 665        call cgtk_fixphase(cg,gsc,icg,igsc,istwf_k,mcg,mgsc,mpi_enreg,nband_k,npw_k*my_nspinor,gs_hamk%usepaw)
 666      else if ( (newlobpcg .or. newchebfi) .and. (.not.has_cprj_in_memory) ) then
 667        ! GSC is local to vtowfk and is completely useless since everything
 668        ! is calculated in my lobpcg, we don't care about the phase of gsc !
 669        call cgtk_fixphase(cg,gsc,icg,igsc,istwf_k,mcg,mgsc,mpi_enreg,nband_k,npw_k*my_nspinor,0)
 670      else ! has_cprj_in_memory
 671        call cgtk_fixphase(cg,gsc,icg,igsc,istwf_k,mcg,mgsc,mpi_enreg,nband_k,npw_k*my_nspinor,0,&
 672          & cprj=cprj_cwavef_bands,nspinor=dtset%nspinor)
 673      end if
 674    end if
 675 
 676    if (iscf < 0) then
 677      if (residk > dtset%tolwfr .and. residk < tol7) then
 678        if (fftcore_mixprec == 1) call wrtout(std_out, " Approaching NSCF convergence. Activating FFT in double-precision")
 679        ii = fftcore_set_mixprec(0)
 680      end if
 681 
 682      ! Print residual and wall-time required by NSCF iteration.
 683      if (inonsc <= enough .or. mod(inonsc, 20) == 0) then
 684        call cwtime(cpu, wall, gflops, "stop")
 685        if (ikpt == 1 .or. mod(ikpt, 100) == 0) then
 686          if (inonsc == 1) call wrtout(std_out, sjoin(" k-point: [", itoa(ikpt), "/", itoa(nkpt), "], spin:", itoa(isppol)))
 687          call wrtout(std_out, sjoin("   Max resid =", ftoa(residk, fmt="es13.5"), &
 688            " (exclude nbdbuf bands). One NSCF iteration cpu-time:", &
 689            sec2str(cpu), ", wall-time:", sec2str(wall)), do_flush=.True.)
 690          if (inonsc == enough) call wrtout(std_out, "   Printing residuals every mod(20) iterations...")
 691        end if
 692      end if
 693    end if
 694    ABI_NVTX_END_RANGE()
 695 
 696    ! Exit loop over inonsc if converged
 697    if (residk < dtset%tolwfr) then
 698      if (iscf < 0 .and. (ikpt == 1 .or. mod(ikpt, 100) == 0)) then
 699        call wrtout(std_out, sjoin("   NSCF loop completed after", itoa(inonsc), "iterations"))
 700      end if
 701      exit
 702    end if
 703  end do ! inonsc (NON SELF-CONSISTENT LOOP)
 704 
 705  if (has_cprj_in_memory) then
 706    update_cprj=dtset%cprj_update_lvl<=3.and.dtset%cprj_update_lvl/=2
 707    if (update_cprj) call cprj_update(cg,cprj_cwavef_bands,gs_hamk,icg,nband_k,mpi_enreg,tim_getcprj)
 708  end if
 709 
 710  call timab(39,2,tsec)
 711  call timab(30,1,tsec) ! "vtowfk  (afterloop)"
 712 
 713  !if (dtset%prtvol > 0)
 714  !call wrtout(std_out, sjoin(" Number of Vnl|Psi> applications:", itoa(nonlop_counter)))
 715 
 716 !###################################################################
 717 
 718 !Compute kinetic energy and non-local energy for each band, and in the SCF
 719 !case, contribution to forces, and eventually accumulate rhoaug
 720 
 721  ndat=1;if (mpi_enreg%paral_kgb==1) ndat=mpi_enreg%bandpp
 722  if(iscf>0 .and. fixed_occ)  then
 723    if(dtset%gpu_option==ABI_GPU_KOKKOS) then
 724 #if defined HAVE_GPU && defined HAVE_YAKL
 725      ABI_MALLOC_MANAGED(wfraug,(/2,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6*MAX(blocksize,ndat)/))
 726 #endif
 727    else
 728      ABI_MALLOC(wfraug,(2,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6*MAX(blocksize,ndat)))
 729    end if
 730  end if
 731 
 732 !"nonlop" routine input parameters
 733  nnlout=3*natom*optforces
 734  signs=1;idir=0
 735  if (gs_hamk%usepaw==0) then
 736    choice=1+optforces
 737    paw_opt=0;cpopt=-1;tim_nonlop=2
 738  else
 739    choice=2*optforces
 740    paw_opt=2;cpopt=0;tim_nonlop=10-8*optforces
 741    if (has_cprj_in_memory) cpopt=2 ! cprj are in memory (but not the derivatives)
 742    if (dtset%usefock==1) then
 743 !     if (dtset%optforces/= 0) then
 744      if (optforces/= 0) then
 745        choice=2;cpopt=1; nnlout=3*natom
 746      end if
 747    end if
 748  end if
 749 
 750  ABI_MALLOC(enlout,(nnlout*blocksize))
 751 
 752  ! Allocation of memory space for one block of waveforms containing blocksize waveforms
 753  if(dtset%gpu_option==ABI_GPU_KOKKOS) then
 754 #if defined HAVE_GPU && defined HAVE_YAKL
 755    ABI_MALLOC_MANAGED(cwavef, (/2,npw_k*my_nspinor*blocksize/))
 756 #endif
 757  else
 758    ABI_MALLOC(cwavef, (2,npw_k*my_nspinor*blocksize))
 759  end if
 760 
 761  if (.not.has_cprj_in_memory) then
 762    if (gs_hamk%usepaw==1.and.(iscf>0.or.gs_hamk%usecprj==1)) then
 763      iorder_cprj=0
 764      nband_k_cprj=nband_k*(mband_cprj/dtset%mband)
 765      bandpp_cprj=mpi_enreg%bandpp
 766      ABI_MALLOC(cwaveprj,(natom,my_nspinor*bandpp_cprj))
 767      ncpgr=0;if (cpopt==1) ncpgr=cprj(1,1)%ncpgr
 768      call pawcprj_alloc(cwaveprj,ncpgr,gs_hamk%dimcprj)
 769    else
 770      ABI_MALLOC(cwaveprj,(0,0))
 771    end if
 772  end if
 773 
 774 !The code below is more efficient if paral_kgb==1 (less MPI communications)
 775 !however OMP is not compatible with paral_kgb since we should define
 776 !which threads performs the call to MPI_ALL_REDUCE.
 777 !This problem can be easily solved by removing MPI_enreg from meanvalue_g so that
 778 !the MPI call is done only once outside the OMP parallel region.
 779 
 780  !call cwtime(cpu, wall, gflops, "start")
 781 
 782 !Loop over bands or blocks of bands. Note that in sequential mode iblock=iband, nblockbd=nband_k and blocksize=1
 783  do iblock=1,nblockbd
 784    occblock=maxval(occ_k(1+(iblock-1)*blocksize:iblock*blocksize))
 785    cwavef(:,:)=cg(:,1+(iblock-1)*npw_k*my_nspinor*blocksize+icg:iblock*npw_k*my_nspinor*blocksize+icg)
 786 
 787 !  Compute kinetic energy of each band
 788    do iblocksize=1,blocksize
 789      iband=(iblock-1)*blocksize+iblocksize
 790 
 791      call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
 792 &     cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),&
 793 &     cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),0)
 794 
 795      ek_k(iband)=ar
 796      if(ANY(ABS(dtset%nucdipmom)>tol8)) then
 797        ABI_MALLOC(ghc_vectornd,(2,npw_k*my_nspinor))
 798        call getghc_nucdip(cg(:,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),&
 799          & ghc_vectornd,gs_hamk%gbound_k,gs_hamk%istwf_k,kg_k,gs_hamk%kpt_k,&
 800          & gs_hamk%mgfft,mpi_enreg,ndat,gs_hamk%ngfft,npw_k,gs_hamk%nvloc,&
 801          & gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,my_nspinor,gs_hamk%vectornd,gs_hamk%gpu_option)
 802        end_k(iband)=DOT_PRODUCT(cg(1,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),&
 803          &                      ghc_vectornd(1,1:npw_k*my_nspinor))+&
 804          &          DOT_PRODUCT(cg(2,1+(iband-1)*npw_k*my_nspinor+icg:iband*npw_k*my_nspinor+icg),&
 805          &                      ghc_vectornd(2,1:npw_k*my_nspinor))
 806        ABI_FREE(ghc_vectornd)
 807      end if
 808      if(paw_dmft%use_dmft==1) then
 809        do iband1=1,nband_k
 810          call meanvalue_g(ar,kinpw,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
 811 &         cg(:,1+(iband -1)*npw_k*my_nspinor+icg:iband *npw_k*my_nspinor+icg),&
 812 &         cg(:,1+(iband1-1)*npw_k*my_nspinor+icg:iband1*npw_k*my_nspinor+icg),paw_dmft%use_dmft,ar_im=ar_im)
 813          ek_k_nd(1,iband,iband1)=ar
 814          ek_k_nd(2,iband,iband1)=ar_im
 815        end do
 816      end if
 817    end do
 818 
 819    if (iscf>0) then
 820 
 821      ABI_NVTX_START_RANGE(NVTX_VTOWFK_FOURWF)
 822      ! In case of fixed occupation numbers, accumulates the partial density
 823      if (fixed_occ .and. mpi_enreg%paral_kgb/=1) then
 824 
 825        ! treat all bands at once on GPU
 826        if (dtset%gpu_option /= ABI_GPU_DISABLED) then
 827 
 828          ABI_MALLOC(weight_t,(blocksize))
 829 
 830          ! compute weights
 831          do iblocksize=1,blocksize
 832            iband=(iblock-1)*blocksize+iblocksize
 833            weight_t(iblocksize) = occ_k(iband) * wtk / gs_hamk%ucvol
 834            if (abs(occ_k(iband)) < tol8) weight_t(iblocksize) = zero
 835          end do
 836 
 837          if (dtset%gpu_option == ABI_GPU_LEGACY) then
 838 #if defined HAVE_GPU_CUDA
 839            call gpu_fourwf(1,&        ! cplex
 840              &     rhoaug(:,:,:,1),&  ! denpot
 841              &     cwavef(:,:),&      ! fofgin
 842              &     dummy,&            ! fofgout
 843              &     wfraug,&           ! fofr
 844              &     gs_hamk%gbound_k,& ! gboundin
 845              &     gs_hamk%gbound_k,& ! gboundout
 846              &     istwf_k,&          ! istwf_k
 847              &     kg_k,&             ! kg_kin
 848              &     kg_k,&             ! kg_kout
 849              &     gs_hamk%mgfft,&    ! mgfft
 850              &     mpi_enreg,&        ! mpi_enreg
 851              &     blocksize,&        ! ndat = number of band in current block of bands
 852              &     gs_hamk%ngfft,&    ! ngfft
 853              &     npw_k,&            ! npwin
 854              &     1,&                ! npwout
 855              &     gs_hamk%n4,&       ! n4
 856              &     gs_hamk%n5,&       ! n5
 857              &     gs_hamk%n6,&       ! n6
 858              &     1,&                ! option
 859              &     mpi_enreg%paral_kgb,& ! paral_kgb
 860              &     tim_fourwf,&          ! tim_fourwf
 861              &     weight_t,&            ! weight_r
 862              &     weight_t)             ! weight_i
 863 #endif
 864          else if (dtset%gpu_option == ABI_GPU_KOKKOS) then
 865 #if defined HAVE_GPU_CUDA
 866            call gpu_fourwf_managed(1,&        ! cplex
 867              &     rhoaug(:,:,:,1),&  ! denpot
 868              &     cwavef(:,:),&      ! fofgin
 869              &     dummy,&            ! fofgout
 870              &     wfraug,&           ! fofr
 871              &     gs_hamk%gbound_k,& ! gboundin
 872              &     gs_hamk%gbound_k,& ! gboundout
 873              &     istwf_k,&          ! istwf_k
 874              &     kg_k,&             ! kg_kin
 875              &     kg_k,&             ! kg_kout
 876              &     gs_hamk%mgfft,&    ! mgfft
 877              &     mpi_enreg,&        ! mpi_enreg
 878              &     blocksize,&        ! ndat = number of band in current block of bands
 879              &     gs_hamk%ngfft,&    ! ngfft
 880              &     npw_k,&            ! npwin
 881              &     1,&                ! npwout
 882              &     gs_hamk%n4,&       ! n4
 883              &     gs_hamk%n5,&       ! n5
 884              &     gs_hamk%n6,&       ! n6
 885              &     1,&                ! option
 886              &     mpi_enreg%paral_kgb,& ! paral_kgb
 887              &     tim_fourwf,&          ! tim_fourwf
 888              &     weight_t,&            ! weight_r
 889              &     weight_t)             ! weight_i
 890 #endif
 891          else if (dtset%gpu_option == ABI_GPU_OPENMP) then
 892 #if defined HAVE_OPENMP_OFFLOAD
 893            call ompgpu_fourwf(1,&     ! cplex
 894              &     rhoaug(:,:,:,1),&  ! denpot
 895              &     cwavef(:,:),&      ! fofgin
 896              &     dummy,&            ! fofgout
 897              &     wfraug,&           ! fofr
 898              &     gs_hamk%gbound_k,& ! gboundin
 899              &     gs_hamk%gbound_k,& ! gboundout
 900              &     istwf_k,&          ! istwf_k
 901              &     kg_k,&             ! kg_kin
 902              &     kg_k,&             ! kg_kout
 903              &     gs_hamk%mgfft,&    ! mgfft
 904              &     blocksize,&        ! ndat = number of band in current block of bands
 905              &     gs_hamk%ngfft,&    ! ngfft
 906              &     npw_k,&            ! npwin
 907              &     1,&                ! npwout
 908              &     gs_hamk%n4,&       ! n4
 909              &     gs_hamk%n5,&       ! n5
 910              &     gs_hamk%n6,&       ! n6
 911              &     1,&                ! option
 912              &     weight_t,&         ! weight_r
 913              &     weight_t)          ! weight_i
 914 #endif
 915          end if
 916 
 917              if (dtset%nspinor==2) then
 918                ABI_ERROR('The case where iscf>0, fixed_occ=True and nspinor=2 is not yet implemented on GPU. FIX ME.')
 919              end if
 920 
 921          ABI_FREE(weight_t)
 922 
 923        else
 924 
 925          do iblocksize=1,blocksize
 926            iband=(iblock-1)*blocksize+iblocksize
 927            cwavef_iband => cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor)
 928 
 929            if (abs(occ_k(iband))>=tol8) then
 930              weight = occ_k(iband) * wtk / gs_hamk%ucvol
 931 
 932              ! Accumulate charge density in real space in array rhoaug
 933 
 934              ! The same section of code is also found in mkrho.F90 : should be rationalized !
 935              call fourwf(1,rhoaug(:,:,:,1),cwavef_iband,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
 936                &           istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
 937                &           gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
 938                &           tim_fourwf,weight,weight,gpu_option=dtset%gpu_option)
 939 
 940              if(dtset%nspinor==2)then
 941                ABI_MALLOC(cwavef1,(2,npw_k))
 942                cwavef1(:,:)=cwavef_iband(:,1+npw_k:2*npw_k) ! EB FR spin dn part and used for m_z component (cwavef_z)
 943 
 944                if(dtset%nspden==1) then
 945 
 946                  call fourwf(1,rhoaug(:,:,:,1),cwavef1,dummy,wfraug,&
 947                    &               gs_hamk%gbound_k,gs_hamk%gbound_k,&
 948                    &               istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
 949                    &               gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
 950                    &               tim_fourwf,weight,weight,gpu_option=dtset%gpu_option)
 951 
 952                else if(dtset%nspden==4) then
 953                  ! Build the four components of rho. We use only norm quantities and, so fourwf.
 954                  ! $\sum_{n} f_n \Psi^{* \alpha}_n \Psi^{\alpha}_n =\rho^{\alpha \alpha}$
 955                  ! $\sum_{n} f_n (\Psi^{1}+\Psi^{2})^*_n (\Psi^{1}+\Psi^{2})_n=rho+m_x$
 956                  ! $\sum_{n} f_n (\Psi^{1}-i \Psi^{2})^*_n (\Psi^{1}-i \Psi^{2})_n=rho+m_y$
 957                  ABI_MALLOC(cwavef_x,(2,npw_k))
 958                  ABI_MALLOC(cwavef_y,(2,npw_k))
 959                  !$(\Psi^{1}+\Psi^{2})$
 960                  cwavef_x(:,:)=cwavef_iband(:,1:npw_k)+cwavef1(:,1:npw_k)
 961                  !$(\Psi^{1}-i \Psi^{2})$
 962                  cwavef_y(1,:)=cwavef_iband(1,1:npw_k)+cwavef1(2,1:npw_k)
 963                  cwavef_y(2,:)=cwavef_iband(2,1:npw_k)-cwavef1(1,1:npw_k)
 964                  ! z component
 965                  call fourwf(1,rhoaug(:,:,:,4),cwavef1,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
 966                    &             istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
 967                    &               gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
 968                    &               tim_fourwf,weight,weight,gpu_option=dtset%gpu_option)
 969                  ! x component
 970                  call fourwf(1,rhoaug(:,:,:,2),cwavef_x,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
 971                    &               istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
 972                    &               gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
 973                    &               tim_fourwf,weight,weight,gpu_option=dtset%gpu_option)
 974                  ! y component
 975                  call fourwf(1,rhoaug(:,:,:,3),cwavef_y,dummy,wfraug,gs_hamk%gbound_k,gs_hamk%gbound_k,&
 976                    &               istwf_k,gs_hamk%kg_k,gs_hamk%kg_k,gs_hamk%mgfft,mpi_enreg,1,gs_hamk%ngfft,npw_k,1,&
 977                    &               gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,1,&
 978                    &               tim_fourwf,weight,weight,gpu_option=dtset%gpu_option)
 979 
 980                  ABI_FREE(cwavef_x)
 981                  ABI_FREE(cwavef_y)
 982 
 983                end if ! dtset%nspden/=4
 984                ABI_FREE(cwavef1)
 985              end if
 986            else
 987              nskip=nskip+1
 988            end if
 989          end do  ! Loop inside a block of bands
 990 
 991        end if ! dtset%gpu_option
 992 
 993 !      In case of fixed occupation numbers,in bandFFT mode accumulates the partial density
 994      else if (fixed_occ .and. mpi_enreg%paral_kgb==1) then
 995 
 996        if (dtset%nspinor==1) then
 997          call timab(537,1,tsec) ! "prep_fourwf%vtow"
 998          call prep_fourwf(rhoaug(:,:,:,1),blocksize,cwavef,wfraug,iblock,istwf_k,&
 999 &         gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,npw_k,&
1000 &         gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,&
1001 &         1,gs_hamk%ucvol,wtk,gpu_option=dtset%gpu_option)
1002          call timab(537,2,tsec)
1003        else if (dtset%nspinor==2) then
1004          ABI_MALLOC(cwavefb,(2,npw_k*blocksize,2))
1005          ibs=(iblock-1)*npw_k*my_nspinor*blocksize+icg
1006 !        --- No parallelization over spinors ---
1007          if (mpi_enreg%paral_spinor==0) then
1008            do iband=1,blocksize
1009              cwavefb(:,(iband-1)*npw_k+1:iband*npw_k,1)=cg(:,1+(2*iband-2)*npw_k+ibs:(iband*2-1)*npw_k+ibs)
1010              cwavefb(:,(iband-1)*npw_k+1:iband*npw_k,2)=cg(:,1+(2*iband-1)*npw_k+ibs:iband*2*npw_k+ibs)
1011            end do
1012          else
1013 !          --- Parallelization over spinors ---
1014 !          (split the work between 2 procs)
1015            cwavefb(:,:,3-ispinor_index)=zero
1016            do iband=1,blocksize
1017              cwavefb(:,(iband-1)*npw_k+1:iband*npw_k,ispinor_index) = cg(:,1+(iband-1)*npw_k+ibs:iband*npw_k+ibs)
1018            end do
1019            call xmpi_sum(cwavefb,mpi_enreg%comm_spinor,ierr)
1020          end if
1021 
1022          call timab(537,1,tsec) !"prep_fourwf%vtow"
1023          if (nspinor1TreatedByThisProc) then
1024            call prep_fourwf(rhoaug(:,:,:,1),blocksize,cwavefb(:,:,1),wfraug,iblock,&
1025 &           istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,npw_k,&
1026 &           gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
1027 &           gpu_option=dtset%gpu_option)
1028          end if
1029          if(dtset%nspden==1) then
1030            if (nspinor2TreatedByThisProc) then
1031              call prep_fourwf(rhoaug(:,:,:,1),blocksize,cwavefb(:,:,2),wfraug,&
1032 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,&
1033 &             gs_hamk%ngfft,npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,&
1034 &             gs_hamk%ucvol,wtk,gpu_option=dtset%gpu_option)
1035            end if
1036          else if(dtset%nspden==4) then
1037            ABI_MALLOC(cwavef_x,(2,npw_k*blocksize))
1038            ABI_MALLOC(cwavef_y,(2,npw_k*blocksize))
1039            cwavef_x(:,:)=cwavefb(:,1:npw_k*blocksize,1)+cwavefb(:,:,2)
1040            cwavef_y(1,:)=cwavefb(1,1:npw_k*blocksize,1)+cwavefb(2,:,2)
1041            cwavef_y(2,:)=cwavefb(2,:,1)-cwavefb(1,:,2)
1042            if (nspinor1TreatedByThisProc) then
1043              call prep_fourwf(rhoaug(:,:,:,4),blocksize,cwavefb(:,:,2),wfraug,&
1044 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,&
1045 &             npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
1046 &             gpu_option=dtset%gpu_option)
1047            end if
1048            if (nspinor2TreatedByThisProc) then
1049              call prep_fourwf(rhoaug(:,:,:,2),blocksize,cwavef_x,wfraug,&
1050 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,&
1051 &             npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
1052 &             gpu_option=dtset%gpu_option)
1053              call prep_fourwf(rhoaug(:,:,:,3),blocksize,cwavef_y,wfraug,&
1054 &             iblock,istwf_k,gs_hamk%mgfft,mpi_enreg,nband_k,ndat,gs_hamk%ngfft,&
1055 &             npw_k,gs_hamk%n4,gs_hamk%n5,gs_hamk%n6,occ_k,1,gs_hamk%ucvol,wtk,&
1056 &             gpu_option=dtset%gpu_option)
1057            end if
1058            ABI_FREE(cwavef_x)
1059            ABI_FREE(cwavef_y)
1060          end if
1061          call timab(537,2,tsec)
1062          ABI_FREE(cwavefb)
1063        end if
1064      end if
1065      ABI_NVTX_END_RANGE()
1066    end if ! End of SCF calculation
1067 
1068 !    Call to nonlocal operator:
1069 !    - Compute nonlocal forces from most recent wfs
1070 !    - PAW: compute projections of WF onto NL projectors (cprj)
1071    ABI_NVTX_START_RANGE(NVTX_VTOWFK_NONLOP)
1072    if (has_cprj_in_memory) then
1073      if (optforces>0) then
1074 !      Treat all wavefunctions in case of PAW
1075        cwaveprj => cprj(:,1+(iblock-1)*my_nspinor*blocksize+ibg:iblock*my_nspinor*blocksize+ibg)
1076        call nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir,eig_k(1+(iblock-1)*blocksize:iblock*blocksize),&
1077 &       mpi_enreg,blocksize,nnlout,&
1078 &       paw_opt,signs,nonlop_dum,tim_nonlop,cwavef,cwavef)
1079 !      Acccumulate forces
1080        iband=(iblock-1)*blocksize
1081        do iblocksize=1,blocksize
1082          ii=0
1083          if (nnlout>3*natom) ii=6
1084          iband=iband+1;ibs=ii+nnlout*(iblocksize-1)
1085          grnl_k(1:nnlout,iband)=enlout(ibs+1:ibs+nnlout)
1086        end do
1087      end if ! PAW or forces
1088    else
1089      if(iscf>0.or.gs_hamk%usecprj==1)then
1090        if (gs_hamk%usepaw==1.or.optforces/=0) then
1091 !        Treat all wavefunctions in case of varying occupation numbers or PAW
1092 !        Only treat occupied bands in case of fixed occupation numbers and NCPP
1093          if(fixed_occ.and.abs(occblock)<=tol8.and.gs_hamk%usepaw==0) then
1094            if (optforces>0) grnl_k(:,(iblock-1)*blocksize+1:iblock*blocksize)=zero
1095          else
1096            if(gs_hamk%usepaw==1) then
1097              call timab(554,1,tsec)  ! "vtowfk:rhoij"
1098            end if
1099            if(cpopt==1) then
1100              iband=1+(iblock-1)*bandpp_cprj
1101              call pawcprj_copy(cprj(:,1+(iblock-1)*my_nspinor*blocksize+ibg:iblock*my_nspinor*blocksize+ibg),cwaveprj)
1102            end if
1103            if (mpi_enreg%paral_kgb==1) then
1104              call timab(572,1,tsec) ! 'prep_nonlop%vtowfk'
1105              call prep_nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir, &
1106 &             eig_k(1+(iblock-1)*blocksize:iblock*blocksize),blocksize,&
1107 &             mpi_enreg,nnlout,paw_opt,signs,nonlop_dum,tim_nonlop_prep,cwavef,cwavef,already_transposed=.false.)
1108              call timab(572,2,tsec)
1109            else
1110              call nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir,eig_k(1+(iblock-1)*blocksize:iblock*blocksize),&
1111 &             mpi_enreg,blocksize,nnlout,&
1112 &             paw_opt,signs,nonlop_dum,tim_nonlop,cwavef,cwavef)
1113            end if
1114            if(gs_hamk%usepaw==1) then
1115              call timab(554,2,tsec)
1116            end if
1117 !          Acccumulate forces
1118            if (optforces>0) then
1119              iband=(iblock-1)*blocksize
1120              do iblocksize=1,blocksize
1121                ii=0
1122                if (nnlout>3*natom) ii=6
1123                iband=iband+1;ibs=ii+nnlout*(iblocksize-1)
1124                grnl_k(1:nnlout,iband)=enlout(ibs+1:ibs+nnlout)
1125              end do
1126            end if
1127 !          Store cprj (<Pnl|Psi>)
1128            if (gs_hamk%usepaw==1.and.gs_hamk%usecprj==1) then
1129              iband=1+(iblock-1)*bandpp_cprj
1130              call pawcprj_put(gs_hamk%atindx,cwaveprj,cprj,natom,iband,ibg,ikpt,iorder_cprj,isppol,&
1131 &             mband_cprj,dtset%mkmem,natom,bandpp_cprj,nband_k_cprj,gs_hamk%dimcprj,my_nspinor,&
1132 &             dtset%nsppol,dtfil%unpaw,mpicomm=mpi_enreg%comm_kpt,proc_distrb=mpi_enreg%proc_distrb)
1133            end if
1134          end if
1135        end if ! PAW or forces
1136      end if ! iscf>0 or iscf=-3
1137    end if
1138    ABI_NVTX_END_RANGE()
1139  end do !  End of loop on blocks
1140  !call cwtime_report(" Block loop", cpu, wall, gflops)
1141 
1142  if(dtset%gpu_option==ABI_GPU_KOKKOS) then
1143 #if defined HAVE_GPU && defined HAVE_YAKL
1144    ABI_FREE_MANAGED(cwavef)
1145 #endif
1146  else
1147    ABI_FREE(cwavef)
1148  end if
1149 
1150  ABI_FREE(enlout)
1151 
1152  if (.not.has_cprj_in_memory) then
1153    if (gs_hamk%usepaw==1.and.(iscf>0.or.gs_hamk%usecprj==1)) then
1154      call pawcprj_free(cwaveprj)
1155    end if
1156    ABI_FREE(cwaveprj)
1157  else
1158    nullify(cwaveprj)
1159  end if
1160 
1161  if (fixed_occ.and.iscf>0) then
1162    if(dtset%gpu_option==ABI_GPU_KOKKOS) then
1163 #if defined HAVE_GPU && defined HAVE_YAKL
1164      ABI_FREE_MANAGED(wfraug)
1165 #endif
1166    else
1167      ABI_FREE(wfraug)
1168    end if
1169  end if
1170 
1171 !Write the number of one-way 3D ffts skipped until now (in case of fixed occupation numbers
1172  if(iscf>0 .and. fixed_occ .and. (prtvol>2 .or. ikpt<=nkpt_max) )then
1173    write(msg,'(a,i0)')' vtowfk: number of one-way 3D ffts skipped in vtowfk until now =',nskip
1174    call wrtout(std_out,msg,'PERS')
1175  end if
1176 
1177  ! Norm-conserving or FockACE: Compute nonlocal+FockACE part of total energy: rotate subvnlx elements
1178  ! Note the two calls. For (old) lobpcgwf we have a (nband_k, nband_k) matrix, whereas cgwf
1179  ! returns results in packed form.
1180  ! CHEBYSHEV, NEW LOBPCG and RMM-DIIS do not need this
1181  !
1182  rotate_subvnlx = gs_hamk%usepaw == 0 .and. wfopta10 /= 1 .and. .not. newlobpcg
1183  if (use_rmm_diis) rotate_subvnlx = .False.
1184 
1185  if (rotate_subvnlx) then
1186    call timab(586,1,tsec)   ! 'vtowfk(nonlocalpart)'
1187    if (wfopta10==4) then
1188      call cg_hrotate_and_get_diag(istwf_k, nband_k, totvnlx, evec, enlx_k)
1189    else
1190      call cg_hprotate_and_get_diag(nband_k, subvnlx, evec, enlx_k)
1191    end if
1192    call timab(586,2,tsec)
1193  end if
1194 
1195 !###################################################################
1196 
1197  if (iscf<=0 .and. residk > dtset%tolwfr) then
1198    write(msg,'(2(a,i0),a,es13.5)')&
1199     "Wavefunctions not converged for ikpt: ", ikpt, ", nnsclo: ",nnsclo_now,', max resid: ',residk
1200    ABI_WARNING(msg)
1201  end if
1202 
1203 !Print out eigenvalues (hartree)
1204  if (prtvol/=5.and.(prtvol>2 .or. ikpt<=nkpt_max)) then
1205    write(msg, '(5x,a,i5,2x,a,a,a,i4,a,i4,a)' ) &
1206     'eigenvalues (hartree) for',nband_k,'bands',ch10,&
1207     '              after ',inonsc,' non-SCF iterations with ',dtset%nline,' CG line minimizations'
1208    call wrtout(std_out,msg,'PERS')
1209    do ii=0,(nband_k-1)/6
1210      write(msg, '(1p,6e12.4)' ) (eig_k(iband),iband=1+6*ii,min(6+6*ii,nband_k))
1211      call wrtout(std_out,msg,'PERS')
1212    end do
1213  else if(ikpt==nkpt_max+1)then
1214    call wrtout(std_out,' vtowfk : prtvol=0 or 1, do not print more k-points.','PERS')
1215  end if
1216 
1217 !Print out decomposition of eigenvalues in the non-selfconsistent case or if prtvol>=10
1218  if( (iscf<0 .and. (prtvol>2 .or. ikpt<=nkpt_max)) .or. prtvol>=10)then
1219    write(msg, '(5x,a,i5,2x,a,a,a,i4,a,i4,a)' ) &
1220 &   ' mean kinetic energy (hartree) for ',nband_k,' bands',ch10,&
1221 &   '              after ',inonsc,' non-SCF iterations with ',dtset%nline,' CG line minimizations'
1222    call wrtout(std_out,msg,'PERS')
1223 
1224    do ii=0,(nband_k-1)/6
1225      write(msg, '(1p,6e12.4)' ) (ek_k(iband),iband=1+6*ii,min(6+6*ii,nband_k))
1226      call wrtout(std_out,msg,'PERS')
1227    end do
1228 
1229    if (gs_hamk%usepaw==0) then
1230      write(msg, '(5x,a,i5,2x,a,a,a,i4,a,i4,a)' ) &
1231 &     ' mean NL+Fock-type energy (hartree) for ',nband_k,' bands',ch10,&
1232 &     '              after ',inonsc,' non-SCF iterations with ',dtset%nline,' CG line minimizations'
1233      call wrtout(std_out,msg,'PERS')
1234 
1235      do ii=0,(nband_k-1)/6
1236        write(msg,'(1p,6e12.4)') (enlx_k(iband),iband=1+6*ii,min(6+6*ii,nband_k))
1237        call wrtout(std_out,msg,'PERS')
1238      end do
1239    end if
1240  end if
1241 
1242  !Hamiltonian constructor for gwls_sternheimer
1243  if(dtset%optdriver==RUNL_GWLS) then
1244    call build_H(dtset,mpi_enreg,cpopt,cg,gs_hamk,kg_k,kinpw)
1245  end if
1246 
1247  if (has_cprj_in_memory) nullify(cprj_cwavef_bands)
1248 
1249  if(wfopta10 /= 1 .and. .not. newlobpcg) then
1250    ABI_FREE(evec)
1251    ABI_FREE(subham)
1252    ABI_FREE(totvnlx)
1253    ABI_FREE(subvnlx)
1254    ABI_FREE(subovl)
1255  end if
1256 
1257  ABI_SFREE(gsc)
1258 
1259  if(wfoptalg==3) then
1260    ABI_FREE(eig_save)
1261  end if
1262 
1263  if (prtvol==-level) then
1264    ! Structured debugging: if prtvol=-level, stop here.
1265    write(msg,'(a,a,a,i0,a)')' vtowfk : exit ',ch10,'  prtvol=-',level,', debugging mode => stop '
1266    ABI_ERROR(msg)
1267  end if
1268 
1269  call timab(30,2,tsec)
1270  call timab(28,2,tsec)
1271 
1272  DBG_EXIT("COLL")
1273 
1274 end subroutine vtowfk