TABLE OF CONTENTS


ABINIT/lobpcgwf [ Functions ]

[ Top ] [ Functions ]

NAME

 lobpcgwf

FUNCTION

 this routine updates the whole wave functions at a given k-point,
 using the lobpcg method
 for a given spin-polarization, from a fixed hamiltonian
 but might also simply compute eigenvectors and eigenvalues at this k point.
 it will also update the matrix elements of the hamiltonian.

COPYRIGHT

 Copyright (C) 1998-2024 ABINIT group (FBottin,GZ,AR,MT,FDahm)
 this file is distributed under the terms of the
 gnu general public license, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 for the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  dtset <type(dataset_type)>=all input variales for this dataset
  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
  icg=shift to be applied on the location of data in the array cg
  igsc=shift to be applied on the location of data in the array gsc
  kinpw(npw)=(modified) kinetic energy for each plane wave (hartree)
  mcg=second dimension of the cg array
  mgsc=second dimension of the gsc array
  mpi_enreg=information about MPI parallelization
  nband_k=number of bands at this k point for that spin polarization
  nbdblock : number of blocks
  npw_k=number of plane waves at this k point
  prtvol=control print volume and debugging output
  use_totvnlx=1 if one has to compute totvnlx

OUTPUT

  resid_k(nband_k)=residuals for each states
  subham(nband_k*(nband_k+1))=the matrix elements of h
  If gs_hamk%usepaw==0:
    gsc(2,mgsc)=<g|s|c> matrix elements (s=overlap)
    totvnlx(nband_k*use_totvnlx,nband_k*use_totvnlx)=the matrix elements of vnl+vfockACE

SIDE EFFECTS

  cg(2,mcg)=updated wavefunctions

SOURCE

  80 subroutine lobpcgwf(cg,dtset,gs_hamk,gsc,icg,igsc,kinpw,mcg,mgsc,mpi_enreg,&
  81 &                   nband_k,nbdblock,npw_k,prtvol,resid_k,subham,totvnlx,use_totvnlx)
  82 
  83 
  84  use defs_basis
  85  use m_abicore
  86  use m_lobpcg
  87  use m_abi_linalg
  88  use m_wfutils
  89  use m_xmpi
  90  use m_errors
  91  use, intrinsic :: iso_c_binding
  92  use m_dtset
  93 
  94  use defs_abitypes, only : mpi_type
  95  use m_time,        only : timab
  96  use m_hamiltonian, only : gs_hamiltonian_type
  97  use m_pawcprj,     only : pawcprj_type
  98  use m_getghc,      only : getghc
  99  use m_prep_kgb,    only : prep_getghc
 100 
 101  implicit none
 102 
 103 !Arguments ------------------------------------
 104  integer,intent(in) :: icg,igsc,mcg,mgsc,nband_k,nbdblock,npw_k,prtvol,use_totvnlx
 105  type(gs_hamiltonian_type),intent(inout) :: gs_hamk
 106  type(dataset_type),intent(in) :: dtset
 107  type(mpi_type),intent(in) :: mpi_enreg
 108  real(dp),intent(inout) :: cg(2,mcg),gsc(2,mgsc)
 109  real(dp),intent(in) :: kinpw(npw_k)
 110  real(dp),intent(out) :: resid_k(nband_k)
 111  real(dp),intent(inout) :: subham(nband_k*(nband_k+1))
 112  real(dp),intent(inout) :: totvnlx((3-gs_hamk%istwf_k)*nband_k*use_totvnlx,nband_k*use_totvnlx)
 113 
 114 !Local variables-------------------------------
 115  integer, parameter :: tim_getghc=5
 116  integer :: activepsize,activersize,bblocksize,bigorder,blocksize,cpopt
 117  integer :: cond_try
 118  integer :: iblocksize,iblock,ierr,ii,info,istwf_k,isubh
 119  integer :: iterationnumber
 120  integer :: iwavef,i1,i2,i3,i4,maxiterations,my_nspinor
 121  integer :: nrestart,optekin,optpcon,restart
 122  integer :: sij_opt,timopt,tim_wfcopy,tim_xeigen
 123  integer :: tim_xortho,tim_xprecon,use_lapack_gpu,use_linalg_gpu,vectsize
 124  logical :: gen_eigenpb
 125  integer :: cplx
 126  real(dp) :: condestgramb,deltae,deold,dum
 127  complex(dpc) :: cminusone
 128  real(dp) :: zvar(2)
 129  logical :: havetoprecon
 130  real(dp) :: tsec(2)
 131  real(dp), allocatable :: gwavef(:,:),cwavef(:,:),gvnlxc(:,:)
 132  real(dp), allocatable :: swavef(:,:)
 133  real(dp), allocatable :: residualnorms(:),eigen(:)
 134  real(dp), allocatable :: tmpeigen(:)
 135  real(dp), allocatable :: pcon(:,:)
 136  real(dp), allocatable, target :: blockvectorx(:,:),blockvectorvx(:,:),blockvectorax(:,:),blockvectorbx(:,:)
 137  real(dp), allocatable, target :: blockvectorr(:,:),blockvectorvr(:,:),blockvectorar(:,:),blockvectorbr(:,:)
 138  real(dp), allocatable, target :: blockvectorp(:,:),blockvectorvp(:,:),blockvectorap(:,:),blockvectorbp(:,:),blockvectordumm(:,:)
 139  real(dp), allocatable, target :: blockvectory(:,:),blockvectorby(:,:),blockvectorz(:,:)
 140  real(dp), allocatable, target :: gramxax(:,:),gramxar(:,:),gramxap(:,:),gramrar(:,:),gramrap(:,:),grampap(:,:)
 141  real(dp), allocatable, target :: gramxbx(:,:),gramxbr(:,:),gramxbp(:,:),gramrbr(:,:),gramrbp(:,:),grampbp(:,:)
 142  real(dp), allocatable, target :: coordx1(:,:),coordx2(:,:),coordx3(:,:),lambda(:,:),grama(:,:),gramb(:,:),gramyx(:,:)
 143  real(dp), allocatable :: tmpgramb(:,:),transf3(:,:,:),transf5(:,:,:)
 144  real(dp), allocatable :: tsubham(:,:)
 145  type(pawcprj_type) :: cprj_dum(gs_hamk%natom,0)
 146  character(len=500) :: message
 147  character, dimension(2) :: cparam
 148  type(c_ptr) :: A_gpu,C_gpu,coordx2_gpu,coordx3_gpu,bblockvector_gpu,gram_gpu
 149  type(c_ptr) :: blockvectorr_gpu,blockvectorar_gpu,blockvectorbr_gpu
 150 
 151 !Index of a given band
 152 !gramindex(iblocksize)=(iblocksize-1)*cplx+1
 153 
 154 ! *********************************************************************
 155 
 156  DBG_ENTER("COLL")
 157 
 158  call timab(530,1,tsec)
 159  if(abs(dtset%timopt)==4) then
 160    call timab(520,1,tsec)
 161  end if
 162 
 163 !###########################################################################
 164 !################ INITIALISATION  ##########################################
 165 !###########################################################################
 166 
 167 !For timing
 168  timopt=dtset%timopt
 169  tim_wfcopy=584
 170  !tim_xcopy=584
 171  tim_xeigen=587
 172  !tim_xgemm=532
 173  tim_xortho=535
 174  tim_xprecon=536
 175  !tim_xtrsm=535
 176 
 177 !Variables
 178  maxiterations=dtset%nline
 179  gen_eigenpb=(gs_hamk%usepaw==1)
 180  my_nspinor=max(1,dtset%nspinor/mpi_enreg%nproc_spinor)
 181  cminusone=-cone
 182  istwf_k=gs_hamk%istwf_k
 183  info = 0
 184  cparam(1)='t'
 185  cparam(2)='c'
 186 
 187 !Depends on istwfk
 188  if ( istwf_k == 2 ) then
 189    cplx=1
 190    if (mpi_enreg%me_g0 == 1) then
 191      vectsize=2*npw_k*my_nspinor-1
 192    else
 193      vectsize=2*npw_k*my_nspinor
 194    end if
 195  else
 196    cplx=2
 197    vectsize=npw_k*my_nspinor
 198  end if
 199 
 200 !For preconditionning
 201  optekin=0;if (dtset%wfoptalg>10) optekin=0
 202  optpcon=1;if (dtset%wfoptalg>10) optpcon=0
 203 
 204 !For communication
 205  !blocksize=mpi_enreg%nproc_fft
 206  !if(mpi_enreg%paral_kgb==1) blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp
 207  blocksize=nband_k/dtset%nblock_lobpcg
 208  !IF you want to compare with new lobpcg in sequential uncomment the following
 209  !line
 210  !blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp
 211 
 212 !Iniitializations/allocations of GPU parallelism
 213  use_linalg_gpu=0;use_lapack_gpu=0
 214  if ((dtset%gpu_option==ABI_GPU_LEGACY).and. &
 215 & (vectsize*blocksize*blocksize>dtset%gpu_linalg_limit)) use_linalg_gpu=1
 216 #if defined HAVE_LINALG_MAGMA
 217  use_lapack_gpu=use_linalg_gpu
 218 #endif
 219  if(use_linalg_gpu==1) then
 220    call alloc_on_gpu(A_gpu,             INT(cplx, c_size_t)*dp*vectsize*blocksize)
 221    call alloc_on_gpu(C_gpu,             INT(cplx, c_size_t)*dp*vectsize*blocksize)
 222    call alloc_on_gpu(blockvectorr_gpu,  INT(cplx, c_size_t)*dp*vectsize*blocksize)
 223    call alloc_on_gpu(blockvectorar_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 224    call alloc_on_gpu(blockvectorbr_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 225    call alloc_on_gpu(coordx2_gpu,       INT(cplx, c_size_t)*dp*blocksize*blocksize)
 226    call alloc_on_gpu(coordx3_gpu,       INT(cplx, c_size_t)*dp*blocksize*blocksize)
 227  end if
 228 
 229  if(abs(dtset%timopt)==4) then
 230    call timab(520,2,tsec)
 231  end if
 232 
 233 !###########################################################################
 234 !################ BIG LOOP OVER BLOCKS  ####################################
 235 !###########################################################################
 236 
 237  do iblock=1,nbdblock
 238 
 239    if(abs(dtset%timopt)==4) then
 240      call timab(521,1,tsec)
 241    end if
 242 
 243    havetoprecon=.true.
 244    nrestart=0
 245    bblocksize=(iblock-1)*blocksize
 246 
 247 !  allocations
 248    ABI_MALLOC(pcon,(npw_k,blocksize))
 249    ABI_MALLOC(blockvectorx,(cplx*vectsize,blocksize))
 250    ABI_MALLOC(blockvectorax,(cplx*vectsize,blocksize))
 251    ABI_MALLOC(blockvectorbx,(cplx*vectsize,blocksize))
 252    ABI_MALLOC(blockvectorr,(cplx*vectsize,blocksize))
 253    ABI_MALLOC(blockvectorar,(cplx*vectsize,blocksize))
 254    ABI_MALLOC(blockvectorbr,(cplx*vectsize,blocksize))
 255    ABI_MALLOC(blockvectorp,(cplx*vectsize,blocksize))
 256    ABI_MALLOC(blockvectorap,(cplx*vectsize,blocksize))
 257    ABI_MALLOC(blockvectorbp,(cplx*vectsize,blocksize))
 258    ABI_MALLOC(blockvectordumm,(cplx*vectsize,blocksize))
 259    ABI_MALLOC(gramxax,(cplx*blocksize,blocksize))
 260    ABI_MALLOC(gramxar,(cplx*blocksize,blocksize))
 261    ABI_MALLOC(gramxap,(cplx*blocksize,blocksize))
 262    ABI_MALLOC(gramrar,(cplx*blocksize,blocksize))
 263    ABI_MALLOC(gramrap,(cplx*blocksize,blocksize))
 264    ABI_MALLOC(grampap,(cplx*blocksize,blocksize))
 265    ABI_MALLOC(gramxbx,(cplx*blocksize,blocksize))
 266    ABI_MALLOC(gramxbr,(cplx*blocksize,blocksize))
 267    ABI_MALLOC(gramxbp,(cplx*blocksize,blocksize))
 268    ABI_MALLOC(gramrbr,(cplx*blocksize,blocksize))
 269    ABI_MALLOC(gramrbp,(cplx*blocksize,blocksize))
 270    ABI_MALLOC(grampbp,(cplx*blocksize,blocksize))
 271    ABI_MALLOC(transf3,(cplx*blocksize,blocksize,3))
 272    ABI_MALLOC(transf5,(cplx*blocksize,blocksize,5))
 273    ABI_MALLOC(lambda,(cplx*blocksize,blocksize))
 274    ABI_MALLOC(residualnorms,(blocksize))
 275 
 276    ABI_MALLOC(blockvectory,(cplx*vectsize,bblocksize))
 277    ABI_MALLOC(blockvectorby,(cplx*vectsize,bblocksize))
 278    ABI_MALLOC(gramyx,(cplx*bblocksize,blocksize))
 279    if (gs_hamk%usepaw==0) then
 280      ABI_MALLOC(blockvectorvx,(cplx*vectsize,blocksize))
 281      ABI_MALLOC(blockvectorvr,(cplx*vectsize,blocksize))
 282      ABI_MALLOC(blockvectorvp,(cplx*vectsize,blocksize))
 283    end if
 284 
 285    if(use_linalg_gpu==1) then
 286      if(iblock/=1) then
 287        call alloc_on_gpu(bblockvector_gpu, INT(cplx, c_size_t)*dp*vectsize*bblocksize)
 288        call alloc_on_gpu(gram_gpu,         INT(cplx, c_size_t)*dp*bblocksize*blocksize)
 289      else
 290        call alloc_on_gpu(bblockvector_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 291        call alloc_on_gpu(gram_gpu,         INT(cplx, c_size_t)*dp*blocksize*blocksize)
 292      end if
 293    end if
 294 
 295 ! Initialize global variables in m_wfutils.
 296    call setWFParameter(cplx,mpi_enreg%me_g0,npw_k,my_nspinor,icg,igsc,blocksize)
 297 
 298 !  transfer array of wf coeff in iblock to blockvectorx
 299    call wfcopy('D',blocksize*vectsize,cg,1,blockvectorx,1,blocksize,iblock,'C',withbbloc=.true.,&
 300 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 301 
 302 !  !!!!!!!!!!!!!!!!!!!!!!!! Begin if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!
 303 !  transfer array of wf coeff less than iblock to blockvectory
 304    if(iblock /=1) then
 305      call wfcopy('D',bblocksize*vectsize,cg,1,blockvectory,1,bblocksize,iblock,'C',withbbloc=.false.,&
 306 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 307 
 308      if(gen_eigenpb) then
 309        call wfcopy('D',bblocksize*vectsize,gsc,1,blockvectorby,1,bblocksize,iblock,'S',withbbloc=.false.,&
 310 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 311      else
 312        call abi_xcopy(vectsize*bblocksize,blockvectory,1,blockvectorby,1,x_cplx=x_cplx)
 313      end if
 314 
 315 !    b-orthogonalize x to the constraint y (supposed b-orthonormal)
 316 !    blockvectorx=blockvectorx-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorx))
 317 
 318      call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,&
 319 &     vectsize,blockvectorx,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx)
 320 
 321      if(abs(dtset%timopt)==3) then
 322        call timab(533,1,tsec)
 323      end if
 324      call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr)
 325      if(abs(dtset%timopt)==3) then
 326        call timab(533,2,tsec)
 327      end if
 328 
 329      call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,&
 330 &     vectsize,gramyx,bblocksize,cone,blockvectorx,vectsize,x_cplx=x_cplx)
 331 
 332    end if
 333 !  !!!!!!!!!!!!!!!!!!!!!!!! End if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!!
 334 
 335    ABI_MALLOC(cwavef,(2,npw_k*my_nspinor*blocksize))
 336    ABI_MALLOC(gwavef,(2,npw_k*my_nspinor*blocksize))
 337    ABI_MALLOC(gvnlxc,(2,npw_k*my_nspinor*blocksize))
 338    ABI_MALLOC(swavef,(2,npw_k*my_nspinor*blocksize))
 339 
 340    call wfcopy('I',vectsize*blocksize,blockvectorx,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,&
 341 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 342 
 343    if(abs(dtset%timopt)==4) then
 344      call timab(521,2,tsec)
 345    end if
 346    if(abs(dtset%timopt)==4) then
 347      call timab(526,1,tsec)
 348    end if
 349 
 350    cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1
 351 
 352    if (mpi_enreg%paral_kgb==0) then
 353      call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlxc,dum,&
 354 &     mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0)
 355    else
 356      call prep_getghc(cwavef,gs_hamk,gvnlxc,gwavef,swavef,dum,blocksize,mpi_enreg,&
 357 &     prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.)
 358    end if
 359    if(abs(dtset%timopt)==4) then
 360      call timab(526,2,tsec)
 361    end if
 362    if(abs(dtset%timopt)==4) then
 363      call timab(522,1,tsec)
 364    end if
 365 
 366    if ( gen_eigenpb ) then
 367      call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbx,1,blocksize,iblock,'W',withbbloc=.false.,&
 368 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 369    else
 370      call wfcopy('D',vectsize*blocksize,gvnlxc,1,blockvectorvx,1,blocksize,iblock,'W',withbbloc=.false.,&
 371 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 372      call abi_xcopy(vectsize*blocksize,blockvectorx,1,blockvectorbx,1,x_cplx=x_cplx)
 373    end if
 374 
 375    call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorax,1,blocksize,iblock,'W',withbbloc=.false.,&
 376 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 377 
 378    ABI_FREE(cwavef)
 379    ABI_FREE(gwavef)
 380    ABI_FREE(gvnlxc)
 381    ABI_FREE(swavef)
 382 
 383    call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,mpi_enreg%comm_bandspinorfft,gramxbx,vectsize,&
 384 &   x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 385 
 386    call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorbx,vectsize,x_cplx=x_cplx)
 387    call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorax,vectsize,x_cplx=x_cplx)
 388 
 389    if (gs_hamk%usepaw==0) then
 390      call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorvx,vectsize,x_cplx=x_cplx)
 391    end if
 392 
 393 !  Do rayleigh ritz on a in space x
 394 !  gramxax=matmul(transpose(blockvectorx),blockvectorax)
 395    call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorx,&
 396 &   vectsize,blockvectorax,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 397 
 398    if(abs(dtset%timopt)==3) then
 399      call timab(533,1,tsec)
 400    end if
 401    call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr)
 402    if(abs(dtset%timopt)==3) then
 403      call timab(533,2,tsec)
 404    end if
 405    ABI_MALLOC(eigen,(blocksize))
 406 
 407    call abi_xheev('v','u',blocksize,gramxax,blocksize,eigen,x_cplx=cplx,istwf_k=istwf_k, &
 408    timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu_magma=use_lapack_gpu)
 409 
 410 !  blockvectorx=matmul(blockvectorx,gramxax)
 411    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,&
 412 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 413    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorx,1,x_cplx=x_cplx)
 414 
 415 !  blockvectorax=matmul(blockvectorax,gramxax)
 416    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,&
 417 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 418    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorax,1,x_cplx=x_cplx)
 419 
 420 !  blockvectorvx=matmul(blockvectorvx,gramxax)
 421    if (gs_hamk%usepaw==0) then
 422      call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,&
 423 &     vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 424      call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvx,1,x_cplx=x_cplx)
 425    end if
 426 
 427 !  blockvectorbx=matmul(blockvectorbx,gramxax)
 428    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,&
 429 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 430    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbx,1,x_cplx=x_cplx)
 431 
 432    do iblocksize=1,blocksize
 433      zvar=(/eigen(iblocksize),zero/)
 434      call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 435    end do
 436    ABI_FREE(eigen)
 437 
 438    if(abs(dtset%timopt)==4) then
 439      call timab(522,2,tsec)
 440    end if
 441 
 442 !  ###########################################################################
 443 !  ################ PERFORM LOOP ON NLINE ####################################
 444 !  ###########################################################################
 445 !  now the main alogrithm
 446    iter: do iterationnumber=1,maxiterations
 447 
 448      if(abs(dtset%timopt)==4) then
 449        call timab(523,1,tsec)
 450      end if
 451 
 452 !    Build residual
 453 !    blockvectorr=blockvectorax-matmul(blockvectorx,lambda)
 454      call xprecon(blockvectorbx,lambda,blocksize,&
 455 &     iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,&
 456 &     optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon)
 457 
 458      residualnorms=sum(blockvectorr**2,dim=1)
 459 
 460      if(abs(dtset%timopt)==3) then
 461        call timab(533,1,tsec)
 462      end if
 463      call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr)
 464      if(abs(dtset%timopt)==3) then
 465        call timab(533,2,tsec)
 466      end if
 467 
 468      resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
 469 
 470 !    If residual sufficiently small stop line minimizations
 471      if (abs(maxval(residualnorms(1:blocksize)))<dtset%tolwfr_diago) then
 472        if (prtvol > 0) then
 473          write(message, '(a,i0,a,i0,a,es12.4)' ) &
 474 &         ' lobpcgwf: block ',iblock,' converged after ',iterationnumber,&
 475 &         ' line minimizations: maxval(resid(1:blocksize)) =',maxval(residualnorms(1:blocksize))
 476          call wrtout(std_out,message,'PERS')
 477        end if
 478        havetoprecon=.false.
 479        exit
 480      end if
 481 
 482      if(use_linalg_gpu==1) then
 483        call copy_on_gpu(blockvectorr, blockvectorr_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 484      end if
 485 
 486      if(iblock /=1) then
 487 !      Residuals orthogonal to blockvectorby
 488 !      blockvectorr=blockvectorr-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorr))
 489 
 490        if(use_linalg_gpu==1) then
 491          call copy_on_gpu(blockvectorby, bblockvector_gpu, INT(cplx, c_size_t)*dp*vectsize*bblocksize)
 492          call gpu_xgemm(cplx,cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,bblockvector_gpu,&
 493 &         vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,bblocksize)
 494          call copy_from_gpu(gramyx, gram_gpu, INT(cplx, c_size_t)*dp*bblocksize*blocksize)
 495        else
 496          call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,&
 497 &         vectsize,blockvectorr,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx)
 498        end if
 499 
 500        if(abs(dtset%timopt)==3) then
 501          call timab(533,1,tsec)
 502        end if
 503        call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr)
 504        if(abs(dtset%timopt)==3) then
 505          call timab(533,2,tsec)
 506        end if
 507 
 508        if(use_linalg_gpu==1) then
 509          call copy_on_gpu(gramyx,       gram_gpu,         INT(cplx, c_size_t)*dp*bblocksize*blocksize)
 510          call copy_on_gpu(blockvectory, bblockvector_gpu, INT(cplx, c_size_t)*dp*vectsize*bblocksize)
 511          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,bblocksize,cminusone,bblockvector_gpu,&
 512 &         vectsize,gram_gpu,bblocksize,cone,blockvectorr_gpu,vectsize)
 513        else
 514          call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,&
 515 &         vectsize,gramyx,bblocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx)
 516        end if
 517 
 518      end if
 519 
 520 !    Residuals orthogonal to blockvectorx
 521 !    blockvectorr=blockvectorr-matmul(blockvectorx,matmul((blockvectorbx)^T,blockvectorr))
 522      if(use_linalg_gpu==1) then
 523        call copy_on_gpu(blockvectorbx, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 524        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,C_gpu,&
 525 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 526        call copy_from_gpu(gramxax, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 527      else
 528        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,&
 529 &       vectsize,blockvectorr,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 530      end if
 531 
 532      if(abs(dtset%timopt)==3) then
 533        call timab(533,1,tsec)
 534      end if
 535      call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr)
 536      if(abs(dtset%timopt)==3) then
 537        call timab(533,2,tsec)
 538      end if
 539 
 540      if(use_linalg_gpu==1) then
 541        call copy_on_gpu(gramxax,      gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 542        call copy_on_gpu(blockvectorx, C_gpu,    INT(cplx, c_size_t)*dp*vectsize*blocksize)
 543        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cminusone,C_gpu,&
 544 &       vectsize,gram_gpu,blocksize,cone,blockvectorr_gpu,vectsize)
 545        call copy_from_gpu(blockvectorr, blockvectorr_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 546      else
 547        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cminusone,blockvectorx,&
 548 &       vectsize,gramxax,blocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx)
 549      end if
 550 
 551      ABI_MALLOC(cwavef,(2,npw_k*my_nspinor*blocksize))
 552      ABI_MALLOC(gwavef,(2,npw_k*my_nspinor*blocksize))
 553      ABI_MALLOC(gvnlxc,(2,npw_k*my_nspinor*blocksize))
 554      ABI_MALLOC(swavef,(2,npw_k*my_nspinor*blocksize))
 555 
 556      call wfcopy('I',vectsize*blocksize,blockvectorr,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,&
 557 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 558 
 559      cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1
 560 
 561      if(abs(dtset%timopt)==4) then
 562        call timab(523,2,tsec)
 563      end if
 564      if(abs(dtset%timopt)==4) then
 565        call timab(526,1,tsec)
 566      end if
 567 
 568      if (mpi_enreg%paral_kgb==0) then
 569        call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlxc,dum,&
 570 &       mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0)
 571      else
 572        call prep_getghc(cwavef,gs_hamk,gvnlxc,gwavef,swavef,dum,blocksize,mpi_enreg,&
 573 &       prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.)
 574      end if
 575 
 576      if(abs(dtset%timopt)==4) then
 577        call timab(526,2,tsec)
 578      end if
 579      if(abs(dtset%timopt)==4) then
 580        call timab(524,1,tsec)
 581      end if
 582 
 583      if (gen_eigenpb) then
 584        call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbr,1,blocksize,iblock,'W',withbbloc=.false.,&
 585 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 586      else
 587        call abi_xcopy(vectsize*blocksize,blockvectorr,1,blockvectorbr,1,x_cplx=x_cplx)
 588        call wfcopy('D',vectsize*blocksize,gvnlxc,1,blockvectorvr,1,blocksize,iblock,'W',withbbloc=.false.,&
 589 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 590      end if
 591 
 592      call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorar,1,blocksize,iblock,'W',withbbloc=.false.,&
 593 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 594 
 595      ABI_FREE(cwavef)
 596      ABI_FREE(gwavef)
 597      ABI_FREE(gvnlxc)
 598      ABI_FREE(swavef)
 599 
 600      if(use_linalg_gpu==1) then
 601        call copy_on_gpu(blockvectorbr, blockvectorbr_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 602        call gpu_xorthonormalize(blockvectorr_gpu,blockvectorbr_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,&
 603 &       x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 604        call copy_from_gpu(blockvectorr,  blockvectorr_gpu,  INT(cplx, c_size_t)*dp*vectsize*blocksize)
 605        call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorbr_gpu,vectsize)
 606        call copy_from_gpu(blockvectorbr, blockvectorbr_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 607        call copy_on_gpu(blockvectorar,   blockvectorar_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 608        call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorar_gpu,vectsize)
 609        call copy_from_gpu(blockvectorar, blockvectorar_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 610        if (gs_hamk%usepaw==0) then
 611          call copy_on_gpu(blockvectorvr,   A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 612          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 613          call copy_from_gpu(blockvectorvr, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 614        end if
 615        call copy_from_gpu(gramrbr, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 616      else
 617        call abi_xorthonormalize(blockvectorr,blockvectorbr,blocksize,mpi_enreg%comm_bandspinorfft,gramrbr,vectsize,&
 618 &       x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 619        call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorbr,vectsize,x_cplx=x_cplx)
 620        call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorar,vectsize,x_cplx=x_cplx)
 621        if (gs_hamk%usepaw==0) then
 622          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorvr,vectsize,x_cplx=x_cplx)
 623        end if
 624      end if
 625 
 626      if(iterationnumber>1) then
 627        if(use_linalg_gpu==1) then
 628          call copy_on_gpu(blockvectorp,    A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 629          call copy_on_gpu(blockvectorbp,   C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 630          call gpu_xorthonormalize(A_gpu,C_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,&
 631 &         x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 632          call copy_from_gpu(blockvectorp,  A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 633          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,C_gpu,vectsize)
 634          call copy_from_gpu(blockvectorbp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 635          call copy_on_gpu(blockvectorap,   A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 636          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 637          call copy_from_gpu(blockvectorap, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 638          if (gs_hamk%usepaw==0) then
 639            call copy_on_gpu(blockvectorvp, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 640            call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 641            call copy_from_gpu(blockvectorvp, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 642          end if
 643          call copy_from_gpu(grampbp, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 644        else
 645 !        call orthonormalize(blockvectorp,blockvectorbp,blockvectorap)
 646          call abi_xorthonormalize(blockvectorp,blockvectorbp,blocksize,mpi_enreg%comm_bandspinorfft,grampbp,vectsize,&
 647 &         x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 648 !        blockvectorap=matmul(blockvectorap,grampbp)
 649          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorbp,vectsize,x_cplx=x_cplx)
 650          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorap,vectsize,x_cplx=x_cplx)
 651          if (gs_hamk%usepaw==0) then
 652            call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorvp,vectsize,x_cplx=x_cplx)
 653          end if
 654        end if
 655      end if
 656 
 657      activersize=blocksize
 658      if (iterationnumber==1) then
 659        activepsize=0
 660        restart=1
 661      else
 662        activepsize=blocksize
 663        restart=0
 664      end if
 665 
 666 !    gramxar=matmul((blockvectorax)^T,blockvectorr)
 667 !    gramrar=matmul((blockvectorar)^T,blockvectorr)
 668 !    gramxax=matmul((blockvectorax)^T,blockvectorx)
 669      if(use_linalg_gpu==1) then
 670        call copy_on_gpu(blockvectorax, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 671 
 672        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,&
 673 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 674        call copy_from_gpu(gramxar, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 675 
 676        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar_gpu,&
 677 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 678        call copy_from_gpu(gramrar, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 679 
 680        call copy_on_gpu(blockvectorx, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 681        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,&
 682 &       vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 683 
 684        call copy_from_gpu(gramxax, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 685      else
 686        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 687 &       vectsize,blockvectorr,vectsize,czero,gramxar,blocksize,x_cplx=x_cplx)
 688        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,&
 689 &       vectsize,blockvectorr,vectsize,czero,gramrar,blocksize,x_cplx=x_cplx)
 690        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 691 &       vectsize,blockvectorx,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 692      end if
 693 
 694      call abi_xcopy(blocksize*blocksize,gramxar,1,transf3(:,:,1),1,x_cplx=x_cplx)
 695      call abi_xcopy(blocksize*blocksize,gramrar,1,transf3(:,:,2),1,x_cplx=x_cplx)
 696      call abi_xcopy(blocksize*blocksize,gramxax,1,transf3(:,:,3),1,x_cplx=x_cplx)
 697      if(abs(dtset%timopt)==3) then
 698        call timab(533,1,tsec)
 699      end if
 700      call xmpi_sum(transf3,mpi_enreg%comm_bandspinorfft,ierr)
 701      if(abs(dtset%timopt)==3) then
 702        call timab(533,2,tsec)
 703      end if
 704 
 705      call abi_xcopy(blocksize*blocksize,transf3(:,:,1),1,gramxar,1,x_cplx=x_cplx)
 706      call abi_xcopy(blocksize*blocksize,transf3(:,:,2),1,gramrar,1,x_cplx=x_cplx)
 707      call abi_xcopy(blocksize*blocksize,transf3(:,:,3),1,gramxax,1,x_cplx=x_cplx)
 708 
 709 !    gramxbx=matmul((blockvectorbx)^T,blockvectorx)
 710 !    gramrbr=matmul((blockvectorbr)^T,blockvectorr)
 711 !    gramxbr=matmul((blockvectorbx)^T,blockvectorr)
 712 !    Note that the gramb matrix is more easier to construct than grama:
 713 !    i) <x|B|x>=<r|B|r>=<p|B|p>=(1;0)
 714 !    since the x, r and p blockvector are normalized
 715 !    ii) <r|B|x>=(0;0)
 716 !    since the x and r blockvector are orthogonalized
 717 !    iii) The <p|B|r> and <p|B|x> have to be computed.
 718      gramxbx(:,:)=zero
 719      gramrbr(:,:)=zero
 720      gramxbr(:,:)=zero
 721      do iblocksize=1,blocksize
 722        zvar=(/one,zero/)
 723        call abi_xcopy(1,zvar,1,gramxbx(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 724        call abi_xcopy(1,zvar,1,gramrbr(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 725      end do
 726 
 727 !    ###########################################################################
 728 !    ################ PERFORM LOOP ON COND #####################################
 729 !    ###########################################################################
 730 
 731      i1=0;i2=blocksize;i3=2*blocksize;i4=3*blocksize
 732      cond: do cond_try=1,2 !2 when restart
 733        if (restart==0) then
 734 
 735 !        gramxap=matmul((blockvectorax)^T,blockvectorp)
 736 !        gramrap=matmul((blockvectorar)^T,blockvectorp)
 737 !        grampap=matmul((blockvectorap)^T,blockvectorp)
 738 !        gramxbp=matmul((blockvectorbx)^T,blockvectorp)
 739 !        gramrbp=matmul((blockvectorbr)^T,blockvectorp)
 740 !        grampbp=matmul((blockvectorbp)^T,blockvectorp)
 741          if(use_linalg_gpu==1) then
 742            call copy_on_gpu(blockvectorp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 743            call copy_on_gpu(blockvectorax,A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 744            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 745 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 746            call copy_from_gpu(gramxap, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 747            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 748 &           cone,blockvectorar_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 749            call copy_from_gpu(gramrap, gram_gpu,  INT(cplx, c_size_t)*dp*blocksize*blocksize)
 750            call copy_on_gpu(blockvectorap, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 751            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 752 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 753            call copy_from_gpu(grampap, gram_gpu,  INT(cplx, c_size_t)*dp*blocksize*blocksize)
 754            call copy_on_gpu(blockvectorbx, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 755            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 756 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 757            call copy_from_gpu(gramxbp, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 758            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 759 &           cone,blockvectorbr_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 760            call copy_from_gpu(gramrbp, gram_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 761          else
 762            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 763 &           vectsize,blockvectorp,vectsize,czero,gramxap,blocksize,x_cplx=x_cplx)
 764            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,&
 765 &           vectsize,blockvectorp,vectsize,czero,gramrap,blocksize,x_cplx=x_cplx)
 766            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorap,&
 767 &           vectsize,blockvectorp,vectsize,czero,grampap,blocksize,x_cplx=x_cplx)
 768            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,&
 769 &           vectsize,blockvectorp,vectsize,czero,gramxbp,blocksize,x_cplx=x_cplx)
 770            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbr,&
 771 &           vectsize,blockvectorp,vectsize,czero,gramrbp,blocksize,x_cplx=x_cplx)
 772          end if
 773 !        It's not necessary to compute the last one: <p|B|p>=(1;0) (see above)
 774          transf5(:,:,1)=gramxap(:,:)
 775          transf5(:,:,2)=gramrap(:,:)
 776          transf5(:,:,3)=grampap(:,:)
 777          transf5(:,:,4)=gramxbp(:,:)
 778          transf5(:,:,5)=gramrbp(:,:)
 779          if(abs(dtset%timopt)==3) then
 780            call timab(533,1,tsec)
 781          end if
 782          call xmpi_sum(transf5,mpi_enreg%comm_bandspinorfft,ierr)
 783          if(abs(dtset%timopt)==3) then
 784            call timab(533,2,tsec)
 785          end if
 786          gramxap(:,:)=transf5(:,:,1)
 787          gramrap(:,:)=transf5(:,:,2)
 788          grampap(:,:)=transf5(:,:,3)
 789          gramxbp(:,:)=transf5(:,:,4)
 790          gramrbp(:,:)=transf5(:,:,5)
 791          grampbp(:,:)=zero
 792          do iblocksize=1,blocksize
 793            zvar=(/one,zero/)
 794            call abi_xcopy(1,zvar,1,grampbp(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 795          end do
 796          bigorder=i4
 797          ABI_MALLOC(grama,(cplx*i4,i4))
 798          ABI_MALLOC(gramb,(cplx*i4,i4))
 799          ABI_MALLOC(eigen,(i4))
 800 !        ABI_MALLOC(coordx,(cplx*i4,blocksize))
 801          ABI_MALLOC(coordx1,(cplx*blocksize,blocksize))
 802          ABI_MALLOC(coordx2,(cplx*blocksize,blocksize))
 803          ABI_MALLOC(coordx3,(cplx*blocksize,blocksize))
 804          grama(:,:)=zero;gramb(:,:)=zero
 805          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax
 806          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar
 807          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxap
 808          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar
 809          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrap
 810          grama(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampap
 811          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx
 812          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr
 813          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxbp
 814          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr
 815          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrbp
 816          gramb(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampbp
 817        else
 818          bigorder=i3
 819          ABI_MALLOC(grama,(cplx*i3,i3))
 820          ABI_MALLOC(gramb,(cplx*i3,i3))
 821          ABI_MALLOC(eigen,(i3))
 822 !        ABI_MALLOC(coordx,(cplx*i3,blocksize))
 823          ABI_MALLOC(coordx1,(cplx*blocksize,blocksize))
 824          ABI_MALLOC(coordx2,(cplx*blocksize,blocksize))
 825          grama(:,:)=zero;gramb(:,:)=zero
 826          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax
 827          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar
 828          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar
 829          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx
 830          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr
 831          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr
 832        end if
 833 
 834        ABI_MALLOC(tmpgramb,(cplx*bigorder,bigorder))
 835        ABI_MALLOC(tmpeigen,(bigorder))
 836        tmpgramb=gramb
 837 
 838        call abi_xheev('v','u',bigorder,tmpgramb,bigorder,tmpeigen,x_cplx=cplx,istwf_k=istwf_k, &
 839 &       timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu_magma=use_lapack_gpu)
 840 
 841        condestgramb=tmpeigen(bigorder)/tmpeigen(1)
 842        ABI_FREE(tmpgramb)
 843        ABI_FREE(tmpeigen)
 844 
 845        if (condestgramb.gt.1d+5.or.condestgramb.lt.0.d0.or.info/=0) then
 846          write(std_out,*)'condition number of the Gram matrix = ',condestgramb
 847          if (cond_try==1.and.restart==0) then
 848            ABI_FREE(grama)
 849            ABI_FREE(gramb)
 850            ABI_FREE(eigen)
 851 !          ABI_FREE(coordx)
 852            ABI_FREE(coordx1)
 853            ABI_FREE(coordx2)
 854            if(bigorder==i4) then
 855              ABI_FREE(coordx3)
 856            end if
 857            if (nrestart.gt.1) then
 858              ABI_WARNING('the minimization is stopped for this block')
 859              exit iter
 860            else
 861              restart=1
 862              nrestart=nrestart+1
 863              call wrtout(std_out,'Lobpcgwf: restart performed',"PERS")
 864            end if
 865          else
 866            ABI_WARNING('Gramm matrix ill-conditionned: results may be unpredictable')
 867          end if
 868        else
 869          exit cond
 870        end if
 871      end do cond
 872 
 873 !    ###########################################################################
 874 !    ################ END LOOP ON COND #########################################
 875 !    ###########################################################################
 876 
 877      call abi_xhegv(1,'v','u',bigorder,grama,bigorder,gramb,bigorder,eigen,x_cplx=cplx,istwf_k=istwf_k, &
 878      timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu_magma=use_lapack_gpu)
 879 
 880      deltae=-one
 881      do iblocksize=1,blocksize
 882        call abi_xcopy(1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,zvar,1,x_cplx=x_cplx)
 883        deltae=max(deltae,abs(cmplx(zvar(1),zvar(2))-eigen(iblocksize)))
 884        zvar=(/eigen(iblocksize),zero/)
 885        call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,x_cplx=x_cplx)
 886      end do
 887 
 888 !    DEBUG
 889 !    write(std_out,*)'eigen',eigen(1:blocksize)
 890 !    ENDDEBUG
 891 
 892 !    coordx(1:bigorder*cplx,1:blocksize)=grama(1:bigorder*cplx,1:blocksize)
 893      coordx1(:,:) =  grama(1+i1*cplx : i2*cplx,1:blocksize)
 894      coordx2(:,:) =  grama(1+i2*cplx : i3*cplx,1:blocksize)
 895      if(bigorder==i4) then
 896        coordx3(:,:) =  grama(1+i3*cplx : i4*cplx,1:blocksize)
 897      end if
 898 
 899 
 900      if(use_linalg_gpu==1) then
 901        call copy_on_gpu(coordx2, coordx2_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 902        if(bigorder==i4) then
 903          call copy_on_gpu(coordx3, coordx3_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
 904        end if
 905      end if
 906 
 907      ABI_FREE(grama)
 908      ABI_FREE(gramb)
 909      ABI_FREE(eigen)
 910      if (restart==0 .and. iterationnumber >1) then
 911 
 912 !      blockvectorp=matmul(blockvectorr,coordx(i2+1:i3,:))+&
 913 !      &               matmul(blockvectorp,coordx(i3+1:i4,:))
 914        if(use_linalg_gpu==1) then
 915 !        call copy_on_gpu(blockvectorr, blockvectorr_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 916          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,&
 917 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 918          call copy_on_gpu(blockvectorp, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 919          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 920 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 921          call copy_from_gpu(blockvectorp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 922        else
 923          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,&
 924 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 925          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorp,&
 926 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 927          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorp,1,x_cplx=x_cplx)
 928        end if
 929 
 930 !      blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))+&
 931 !      &                matmul(blockvectorap,coordx(i3+1:i4,:))
 932        if(use_linalg_gpu==1) then
 933 !        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 934          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,&
 935 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 936          call copy_on_gpu(blockvectorap, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 937          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 938 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 939          call copy_from_gpu(blockvectorap, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 940        else
 941          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,&
 942 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 943          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorap,&
 944 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 945          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorap,1,x_cplx=x_cplx)
 946        end if
 947 
 948 
 949 !      blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))+&
 950 !      &                matmul(blockvectorvp,coordx(i3+1:i4,:))
 951        if (gs_hamk%usepaw==0) then
 952          if(use_linalg_gpu==1) then
 953            call copy_on_gpu(blockvectorvr, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 954            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 955 &           vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 956            call copy_on_gpu(blockvectorvp, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 957            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 958 &           vectsize,coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 959            call copy_from_gpu(blockvectorvp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 960          else
 961            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,&
 962 &           vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 963            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvp,&
 964 &           vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 965            call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvp,1,x_cplx=x_cplx)
 966          end if
 967        end if
 968 
 969 !      blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))+&
 970 !      &                matmul(blockvectorbp,coordx(i3+1:i4,:))
 971        if(use_linalg_gpu==1) then
 972 !        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 973          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,&
 974 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 975          call copy_on_gpu(blockvectorbp, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 976          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 977 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 978          call copy_from_gpu(blockvectorbp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 979        else
 980          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,&
 981 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 982          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbp,&
 983 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 984          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbp,1,x_cplx=x_cplx)
 985        end if
 986 
 987      else
 988 
 989 !      blockvectoSz =matmul(blockvectorr,coordx(i2+1:i3,:))
 990        if(use_linalg_gpu==1) then
 991 !        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 992          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,&
 993 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 994          call copy_from_gpu(blockvectorp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
 995        else
 996          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,&
 997 &         vectsize,coordx2,blocksize,czero,blockvectorp,vectsize,x_cplx=x_cplx)
 998        end if
 999 
1000 !      blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))
1001        if(use_linalg_gpu==1) then
1002 !        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
1003          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,&
1004 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1005          call copy_from_gpu(blockvectorap, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1006        else
1007          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,&
1008 &         vectsize,coordx2,blocksize,czero,blockvectorap,vectsize,x_cplx=x_cplx)
1009        end if
1010 !      blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))
1011        if (gs_hamk%usepaw==0) then
1012          if(use_linalg_gpu==1) then
1013            call copy_on_gpu(blockvectorvr, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1014            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1015 &           vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1016            call copy_from_gpu(blockvectorvp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1017          else
1018            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,&
1019 &           vectsize,coordx2,blocksize,czero,blockvectorvp,vectsize,x_cplx=x_cplx)
1020          end if
1021        end if
1022 
1023 !      blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))
1024        if(use_linalg_gpu==1) then
1025 !        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
1026          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,&
1027 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1028          call copy_from_gpu(blockvectorbp, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1029        else
1030          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,&
1031 &         vectsize,coordx2,blocksize,czero,blockvectorbp,vectsize,x_cplx=x_cplx)
1032        end if
1033      end if
1034 
1035      if(use_linalg_gpu==1) then
1036        call copy_on_gpu(coordx1, coordx2_gpu, INT(cplx, c_size_t)*dp*blocksize*blocksize)
1037      end if
1038 
1039 !    blockvectorx = matmul(blockvectorx,coordx(i1+1:i2,:))+blockvectorp
1040      if(use_linalg_gpu==1) then
1041        call copy_on_gpu(blockvectorx, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1042        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1043 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1044        call copy_from_gpu(blockvectordumm, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1045      else
1046        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,&
1047 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1048      end if
1049      blockvectorx = blockvectordumm+blockvectorp
1050 
1051 !    blockvectorax= matmul(blockvectorax,coordx(i1+1:i2,:))+blockvectorap
1052      if(use_linalg_gpu==1) then
1053        call copy_on_gpu(blockvectorax, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1054        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1055 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1056        call copy_from_gpu(blockvectordumm, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1057      else
1058        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,&
1059 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1060      end if
1061      blockvectorax = blockvectordumm+blockvectorap
1062 
1063 !    blockvectorvx= matmul(blockvectorvx,coordx(i1+1:i2,:))+blockvectorvp
1064      if (gs_hamk%usepaw==0) then
1065        if(use_linalg_gpu==1) then
1066          call copy_on_gpu(blockvectorvx, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1067          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1068 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1069          call copy_from_gpu(blockvectordumm, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1070        else
1071          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,&
1072 &         vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1073        end if
1074        blockvectorvx = blockvectordumm+blockvectorvp
1075      end if
1076 
1077 !    blockvectorbx= matmul(blockvectorbx,coordx(i1+1:i2,:))+blockvectorbp
1078      if(use_linalg_gpu==1) then
1079        call copy_on_gpu(blockvectorbx, A_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1080        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1081 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1082        call copy_from_gpu(blockvectordumm, C_gpu, INT(cplx, c_size_t)*dp*vectsize*blocksize)
1083      else
1084        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,&
1085 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1086      end if
1087      blockvectorbx = blockvectordumm+blockvectorbp
1088 
1089 !    ABI_FREE(coordx)
1090      ABI_FREE(coordx1)
1091      ABI_FREE(coordx2)
1092      if(bigorder==i4) then
1093        ABI_FREE(coordx3)
1094      end if
1095 
1096 !    Check convergence on energy and eventually exit
1097      if (iterationnumber==1) then
1098        deold=deltae
1099      else if (iterationnumber>1) then
1100        if ((abs(deltae)<0.005*abs(deold)).and.(iterationnumber/=maxiterations))then
1101          if(prtvol>=10)then
1102            write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) &
1103 &           ' lobpcgwf: block',iblock,', line',iterationnumber,&
1104 &           ', deltae=',deltae,' < 0.005*',deold,' =>skip lines !'
1105            call wrtout(std_out,message,'PERS')
1106          end if
1107          exit
1108        else if (abs(deltae)>0.005*abs(deold)) then
1109          if(prtvol>=10)then
1110            write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) &
1111 &           ' lobpcgwf: block',iblock,', line',iterationnumber,&
1112 &           ', deltae=',deltae,' > 0.005*',deold,' =>keep on working !'
1113            call wrtout(std_out,message,'PERS')
1114          end if
1115        end if
1116      end if
1117 
1118      if(abs(dtset%timopt)==4) then
1119        call timab(524,2,tsec)
1120      end if
1121 
1122    end do iter
1123 
1124 !  ###########################################################################
1125 !  ################## END LOOP ON NLINE ######################################
1126 !  ###########################################################################
1127 
1128    if(abs(dtset%timopt)==4) then
1129      call timab(525,1,tsec)
1130    end if
1131 
1132    if (havetoprecon) then
1133      call xprecon(blockvectorbx,lambda,blocksize,&
1134 &     iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,&
1135 &     optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon)
1136 
1137      residualnorms=sum(abs(blockvectorr)**2,dim=1)
1138 
1139      if(abs(dtset%timopt)==3) then
1140        call timab(533,1,tsec)
1141      end if
1142      call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr)
1143      if(abs(dtset%timopt)==3) then
1144        call timab(533,2,tsec)
1145      end if
1146 
1147      resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
1148    end if
1149 
1150    call wfcopy('I',vectsize*blocksize,blockvectorx,1,cg,1,blocksize,iblock,'C',withbbloc=.true.,&
1151 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
1152 
1153    if(gen_eigenpb) then
1154      call wfcopy('I',vectsize*blocksize,blockvectorbx,1,gsc,1,blocksize,iblock,'S',withbbloc=.true.,&
1155 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
1156    end if
1157 
1158 !  The Vnl+VFockACE part of the Hamiltonian is no more stored in the packed form such as it was the case for subvnlx(:).
1159 !  Now, the full matrix is stored in totvnlx(:,:). This trick permits:
1160 !  1) to avoid the reconstruction of the total matrix in vtowfk.F90 (double loop over bands)
1161 !  2) to use two optimized matrix-matrix blas routine for general (in lobpcgccwf.F90) or hermitian (in vtowfk.F90)
1162 !  operators, zgemm.f and zhemm.f respectively, rather than a triple loop in both cases.
1163    iwavef=iblock*blocksize
1164    isubh=1+2*bblocksize*(bblocksize+1)/2
1165 
1166    ABI_MALLOC(blockvectorz,(cplx*vectsize,iwavef))
1167    if(bblocksize > 0 ) then
1168      call abi_xcopy(bblocksize*vectsize,blockvectory(:,1:bblocksize),1,blockvectorz(:,1:bblocksize),1,x_cplx=x_cplx)
1169    end if
1170    call abi_xcopy( blocksize*vectsize,blockvectorx(:,1:blocksize) ,1,blockvectorz(:,bblocksize+1:iwavef),1,x_cplx=x_cplx)
1171 
1172    ABI_MALLOC(tsubham,(cplx*iwavef,blocksize))
1173    tsubham(:,:)=zero
1174    call abi_xgemm(cparam(cplx),'n',iwavef,blocksize,vectsize,cone,blockvectorz,vectsize,&
1175 &   blockvectorax,vectsize,czero,tsubham,iwavef,x_cplx=x_cplx)
1176 
1177    if (gs_hamk%usepaw==0) then
1178      ! MG FIXME: Here gfortran4.9 allocates temporary array for C in abi_d2zgemm.
1179      call abi_xgemm(cparam(cplx),'n',blocksize,iwavef,vectsize,cone,blockvectorvx,vectsize,&
1180 &     blockvectorz,vectsize,czero,totvnlx(cplx*bblocksize+1:cplx*iwavef,1:iwavef),blocksize,x_cplx=x_cplx)
1181    end if
1182 
1183    do iblocksize=1,blocksize
1184      do ii=1,bblocksize+iblocksize
1185        if ( cplx == 1 ) then
1186          subham(isubh)  = tsubham(ii,iblocksize)
1187          subham(isubh+1)= zero
1188        else
1189          subham(isubh)  = tsubham(2*ii-1,iblocksize)
1190          subham(isubh+1)= tsubham(2*ii  ,iblocksize)
1191        end if
1192        isubh=isubh+2
1193      end do
1194    end do
1195    ABI_FREE(tsubham)
1196    ABI_FREE(blockvectorz)
1197 !  comm for subham and subvnlx are made in vtowfk
1198 
1199    ABI_FREE(pcon)
1200    ABI_FREE(blockvectory)
1201    ABI_FREE(blockvectorby)
1202    ABI_FREE(gramyx)
1203    ABI_FREE(blockvectorx)
1204    ABI_FREE(blockvectorax)
1205    ABI_FREE(blockvectorbx)
1206    ABI_FREE(blockvectorr)
1207    ABI_FREE(blockvectorar)
1208    ABI_FREE(blockvectorbr)
1209    ABI_FREE(blockvectorp)
1210    ABI_FREE(blockvectorap)
1211    ABI_FREE(blockvectorbp)
1212    if (gs_hamk%usepaw==0) then
1213      ABI_FREE(blockvectorvx)
1214      ABI_FREE(blockvectorvp)
1215      ABI_FREE(blockvectorvr)
1216    end if
1217    ABI_FREE(blockvectordumm)
1218    ABI_FREE(gramxax)
1219    ABI_FREE(gramxar)
1220    ABI_FREE(gramxap)
1221    ABI_FREE(gramrar)
1222    ABI_FREE(gramrap)
1223    ABI_FREE(grampap)
1224    ABI_FREE(gramxbx)
1225    ABI_FREE(gramxbr)
1226    ABI_FREE(gramxbp)
1227    ABI_FREE(gramrbr)
1228    ABI_FREE(gramrbp)
1229    ABI_FREE(grampbp)
1230    ABI_FREE(transf3)
1231    ABI_FREE(transf5)
1232    ABI_FREE(lambda)
1233    ABI_FREE(residualnorms)
1234    if(use_linalg_gpu==1) then
1235      call dealloc_on_gpu(bblockvector_gpu)
1236      call dealloc_on_gpu(gram_gpu)
1237    end if
1238 
1239  end do  ! End big loop over bands inside blocks
1240 
1241  if(use_linalg_gpu==1) then
1242    call dealloc_on_gpu(blockvectorr_gpu)
1243    call dealloc_on_gpu(blockvectorar_gpu)
1244    call dealloc_on_gpu(blockvectorbr_gpu)
1245    call dealloc_on_gpu(A_gpu)
1246    call dealloc_on_gpu(C_gpu)
1247    call dealloc_on_gpu(coordx2_gpu)
1248    call dealloc_on_gpu(coordx3_gpu)
1249    !call gpu_linalg_shutdown()
1250  end if
1251 
1252  if(abs(dtset%timopt)==4) then
1253    call timab(525,2,tsec)
1254  end if
1255  call timab(530,2,tsec)
1256 
1257  DBG_ENTER("COLL")
1258 
1259  contains
1260 
1261    function gramindex(iblocksize)
1262 
1263    integer :: gramindex,iblocksize
1264    gramindex=(iblocksize-1)*cplx+1
1265  end function gramindex
1266 
1267 end subroutine lobpcgwf

ABINIT/m_lobpcgwf_old [ Modules ]

[ Top ] [ Modules ]

NAME

   m_lobpcgwf_old

FUNCTION

COPYRIGHT

  Copyright (C) 2008-2024 ABINIT group ()
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 module m_lobpcgwf_old
23 
24  implicit none
25 
26  private