TABLE OF CONTENTS


ABINIT/lobpcgwf [ Functions ]

[ Top ] [ Functions ]

NAME

 lobpcgwf

FUNCTION

 this routine updates the whole wave functions at a given k-point,
 using the lobpcg method
 for a given spin-polarization, from a fixed hamiltonian
 but might also simply compute eigenvectors and eigenvalues at this k point.
 it will also update the matrix elements of the hamiltonian.

COPYRIGHT

 Copyright (C) 1998-2018 ABINIT group (FBottin,GZ,AR,MT,FDahm)
 this file is distributed under the terms of the
 gnu general public license, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 for the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  dtset <type(dataset_type)>=all input variales for this dataset
  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
  icg=shift to be applied on the location of data in the array cg
  igsc=shift to be applied on the location of data in the array gsc
  kinpw(npw)=(modified) kinetic energy for each plane wave (hartree)
  mcg=second dimension of the cg array
  mgsc=second dimension of the gsc array
  mpi_enreg=informations about MPI parallelization
  nband_k=number of bands at this k point for that spin polarization
  nbdblock : number of blocks
  npw_k=number of plane waves at this k point
  prtvol=control print volume and debugging output

OUTPUT

  resid_k(nband_k)=residuals for each states
  subham(nband_k*(nband_k+1))=the matrix elements of h
  If gs_hamk%usepaw==0:
    gsc(2,mgsc)=<g|s|c> matrix elements (s=overlap)
    totvnl(nband_k*(1-gs_hamk%usepaw),nband_k*(1-gs_hamk%usepaw))=the matrix elements of vnl

SIDE EFFECTS

  cg(2,mcg)=updated wavefunctions

PARENTS

      vtowfk

CHILDREN

      abi_xcopy,abi_xgemm,abi_xheev,abi_xhegv,abi_xorthonormalize,abi_xtrsm
      alloc_on_gpu,copy_from_gpu,copy_on_gpu,dealloc_on_gpu,getghc,gpu_xgemm
      gpu_xorthonormalize,gpu_xtrsm,prep_getghc,setwfparameter,timab,wfcopy
      wrtout,xmpi_sum,xprecon

SOURCE

  55 #if defined HAVE_CONFIG_H
  56 #include "config.h"
  57 #endif
  58 
  59 #include "abi_common.h"
  60 
  61 subroutine lobpcgwf(cg,dtset,gs_hamk,gsc,icg,igsc,kinpw,mcg,mgsc,mpi_enreg,&
  62 &                   nband_k,nbdblock,npw_k,prtvol,resid_k,subham,totvnl)
  63 
  64 
  65  use defs_abitypes
  66  use defs_basis
  67  use m_profiling_abi
  68  use m_lobpcg
  69  use m_abi_linalg
  70  use m_wfutils
  71  use m_xmpi
  72  use m_errors
  73  use iso_c_binding
  74 
  75  use m_hamiltonian, only : gs_hamiltonian_type
  76  use m_pawcprj,     only : pawcprj_type
  77 
  78 !This section has been created automatically by the script Abilint (TD).
  79 !Do not modify the following lines by hand.
  80 #undef ABI_FUNC
  81 #define ABI_FUNC 'lobpcgwf'
  82  use interfaces_14_hidewrite
  83  use interfaces_18_timing
  84  use interfaces_66_wfs
  85 !End of the abilint section
  86 
  87  implicit none
  88 
  89 !Arguments ------------------------------------
  90  integer,intent(in) :: icg,igsc,mcg,mgsc,nband_k,nbdblock,npw_k,prtvol
  91  type(gs_hamiltonian_type),intent(inout) :: gs_hamk
  92  type(dataset_type),intent(in) :: dtset
  93  type(mpi_type),intent(inout) :: mpi_enreg
  94  real(dp),intent(inout) :: cg(2,mcg),gsc(2,mgsc)
  95  real(dp),intent(in) :: kinpw(npw_k)
  96  real(dp),intent(out) :: resid_k(nband_k)
  97  real(dp),intent(inout) :: subham(nband_k*(nband_k+1))
  98  real(dp),intent(inout) :: totvnl((3-gs_hamk%istwf_k)*nband_k*(1-gs_hamk%usepaw),nband_k*(1-gs_hamk%usepaw))
  99 
 100 !Local variables-------------------------------
 101  integer, parameter :: tim_getghc=5
 102  integer :: activepsize,activersize,bblocksize,bigorder,blocksize,cpopt
 103  integer :: cond_try
 104  integer :: iblocksize,iblock,ierr,ii,info,istwf_k,isubh
 105  integer :: iterationnumber
 106  integer :: iwavef,i1,i2,i3,i4,maxiterations,my_nspinor
 107  integer :: nrestart,optekin,optpcon,restart
 108  integer :: sij_opt,timopt,tim_wfcopy,tim_xeigen
 109  integer :: tim_xortho,tim_xprecon,use_lapack_gpu,use_linalg_gpu,vectsize
 110  logical :: gen_eigenpb
 111  integer :: cplx
 112  real(dp) :: condestgramb,deltae,deold,dum
 113  complex(dpc) :: cminusone
 114  real(dp) :: zvar(2)
 115  logical :: havetoprecon
 116  real(dp) :: tsec(2)
 117  real(dp), allocatable :: gwavef(:,:),cwavef(:,:),gvnlc(:,:)
 118  real(dp), allocatable :: swavef(:,:)
 119  real(dp), allocatable :: residualnorms(:),eigen(:)
 120  real(dp), allocatable :: tmpeigen(:)
 121  real(dp), allocatable :: pcon(:,:)
 122  real(dp), allocatable :: blockvectorx(:,:),blockvectorvx(:,:),blockvectorax(:,:),blockvectorbx(:,:)
 123  real(dp),allocatable  :: blockvectorr(:,:),blockvectorvr(:,:),blockvectorar(:,:),blockvectorbr(:,:)
 124  real(dp),allocatable  :: blockvectorp(:,:),blockvectorvp(:,:),blockvectorap(:,:),blockvectorbp(:,:),blockvectordumm(:,:)
 125  real(dp),allocatable  :: blockvectory(:,:),blockvectorby(:,:),blockvectorz(:,:)
 126  real(dp),allocatable  :: gramxax(:,:),gramxar(:,:),gramxap(:,:),gramrar(:,:),gramrap(:,:),grampap(:,:)
 127  real(dp),allocatable  :: gramxbx(:,:),gramxbr(:,:),gramxbp(:,:),gramrbr(:,:),gramrbp(:,:),grampbp(:,:)
 128  real(dp),allocatable  :: coordx1(:,:),coordx2(:,:),coordx3(:,:),lambda(:,:),grama(:,:),gramb(:,:),gramyx(:,:)
 129  real(dp),allocatable  :: tmpgramb(:,:),transf3(:,:,:),transf5(:,:,:)
 130  real(dp), allocatable :: tsubham(:,:)
 131  type(pawcprj_type) :: cprj_dum(gs_hamk%natom,0)
 132  character(len=500) :: message
 133  character, dimension(2) :: cparam
 134  type(c_ptr) :: A_gpu,C_gpu,coordx2_gpu,coordx3_gpu,bblockvector_gpu,gram_gpu
 135  type(c_ptr) :: blockvectorr_gpu,blockvectorar_gpu,blockvectorbr_gpu
 136 
 137 !Index of a given band
 138 !gramindex(iblocksize)=(iblocksize-1)*cplx+1
 139 
 140 ! *********************************************************************
 141 
 142  DBG_ENTER("COLL")
 143 
 144  call timab(530,1,tsec)
 145  if(abs(dtset%timopt)==4) then
 146    call timab(520,1,tsec)
 147  end if
 148 
 149 !###########################################################################
 150 !################ INITIALISATION  ##########################################
 151 !###########################################################################
 152 
 153 !For timing
 154  timopt=dtset%timopt
 155  tim_wfcopy=584
 156  !tim_xcopy=584
 157  tim_xeigen=587
 158  !tim_xgemm=532
 159  tim_xortho=535
 160  tim_xprecon=536
 161  !tim_xtrsm=535
 162 
 163 !Variables
 164  maxiterations=dtset%nline
 165  gen_eigenpb=(gs_hamk%usepaw==1)
 166  my_nspinor=max(1,dtset%nspinor/mpi_enreg%nproc_spinor)
 167  cminusone=-cone
 168  istwf_k=gs_hamk%istwf_k
 169  info = 0
 170  cparam(1)='t'
 171  cparam(2)='c'
 172 
 173 !Depends on istwfk
 174  if ( istwf_k == 2 ) then
 175    cplx=1
 176    if (mpi_enreg%me_g0 == 1) then
 177      vectsize=2*npw_k*my_nspinor-1
 178    else
 179      vectsize=2*npw_k*my_nspinor
 180    end if
 181  else
 182    cplx=2
 183    vectsize=npw_k*my_nspinor
 184  end if
 185 
 186 !For preconditionning
 187  optekin=0;if (dtset%wfoptalg>10) optekin=0
 188  optpcon=1;if (dtset%wfoptalg>10) optpcon=0
 189 
 190 !For communication
 191  blocksize=mpi_enreg%nproc_fft
 192  if(mpi_enreg%paral_kgb==1) blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp
 193  !IF you want to compare with new lobpcg in sequential uncomment the following
 194  !line
 195  !blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp
 196 
 197 !Iniitializations/allocations of GPU parallelism
 198  use_linalg_gpu=0;use_lapack_gpu=0
 199  if ((dtset%use_gpu_cuda==1).and. &
 200 & (vectsize*blocksize*blocksize>dtset%gpu_linalg_limit)) use_linalg_gpu=1
 201 #if defined HAVE_LINALG_MAGMA
 202  use_lapack_gpu=use_linalg_gpu
 203 #endif
 204  if(use_linalg_gpu==1) then
 205 !  call gpu_linalg_init()
 206    call alloc_on_gpu(A_gpu,cplx*dp*vectsize*blocksize)
 207    call alloc_on_gpu(C_gpu,cplx*dp*vectsize*blocksize)
 208    call alloc_on_gpu(blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 209    call alloc_on_gpu(blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 210    call alloc_on_gpu(blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 211    call alloc_on_gpu(coordx2_gpu,cplx*dp*blocksize*blocksize)
 212    call alloc_on_gpu(coordx3_gpu,cplx*dp*blocksize*blocksize)
 213  end if
 214 
 215  if(abs(dtset%timopt)==4) then
 216    call timab(520,2,tsec)
 217  end if
 218 
 219 !###########################################################################
 220 !################ BIG LOOP OVER BLOCKS  ####################################
 221 !###########################################################################
 222 
 223  do iblock=1,nbdblock
 224 
 225    if(abs(dtset%timopt)==4) then
 226      call timab(521,1,tsec)
 227    end if
 228 
 229    havetoprecon=.true.
 230    nrestart=0
 231    bblocksize=(iblock-1)*blocksize
 232 
 233 !  allocations
 234    ABI_ALLOCATE(pcon,(npw_k,blocksize))
 235    ABI_ALLOCATE(blockvectorx,(cplx*vectsize,blocksize))
 236    ABI_ALLOCATE(blockvectorax,(cplx*vectsize,blocksize))
 237    ABI_ALLOCATE(blockvectorbx,(cplx*vectsize,blocksize))
 238    ABI_ALLOCATE(blockvectorr,(cplx*vectsize,blocksize))
 239    ABI_ALLOCATE(blockvectorar,(cplx*vectsize,blocksize))
 240    ABI_ALLOCATE(blockvectorbr,(cplx*vectsize,blocksize))
 241    ABI_ALLOCATE(blockvectorp,(cplx*vectsize,blocksize))
 242    ABI_ALLOCATE(blockvectorap,(cplx*vectsize,blocksize))
 243    ABI_ALLOCATE(blockvectorbp,(cplx*vectsize,blocksize))
 244    ABI_ALLOCATE(blockvectordumm,(cplx*vectsize,blocksize))
 245    ABI_ALLOCATE(gramxax,(cplx*blocksize,blocksize))
 246    ABI_ALLOCATE(gramxar,(cplx*blocksize,blocksize))
 247    ABI_ALLOCATE(gramxap,(cplx*blocksize,blocksize))
 248    ABI_ALLOCATE(gramrar,(cplx*blocksize,blocksize))
 249    ABI_ALLOCATE(gramrap,(cplx*blocksize,blocksize))
 250    ABI_ALLOCATE(grampap,(cplx*blocksize,blocksize))
 251    ABI_ALLOCATE(gramxbx,(cplx*blocksize,blocksize))
 252    ABI_ALLOCATE(gramxbr,(cplx*blocksize,blocksize))
 253    ABI_ALLOCATE(gramxbp,(cplx*blocksize,blocksize))
 254    ABI_ALLOCATE(gramrbr,(cplx*blocksize,blocksize))
 255    ABI_ALLOCATE(gramrbp,(cplx*blocksize,blocksize))
 256    ABI_ALLOCATE(grampbp,(cplx*blocksize,blocksize))
 257    ABI_ALLOCATE(transf3,(cplx*blocksize,blocksize,3))
 258    ABI_ALLOCATE(transf5,(cplx*blocksize,blocksize,5))
 259    ABI_ALLOCATE(lambda,(cplx*blocksize,blocksize))
 260    ABI_ALLOCATE(residualnorms,(blocksize))
 261 
 262    ABI_ALLOCATE(blockvectory,(cplx*vectsize,bblocksize))
 263    ABI_ALLOCATE(blockvectorby,(cplx*vectsize,bblocksize))
 264    ABI_ALLOCATE(gramyx,(cplx*bblocksize,blocksize))
 265    if (gs_hamk%usepaw==0) then
 266      ABI_ALLOCATE(blockvectorvx,(cplx*vectsize,blocksize))
 267      ABI_ALLOCATE(blockvectorvr,(cplx*vectsize,blocksize))
 268      ABI_ALLOCATE(blockvectorvp,(cplx*vectsize,blocksize))
 269    end if
 270 
 271    if(use_linalg_gpu==1) then
 272      if(iblock/=1) then
 273        call alloc_on_gpu(bblockvector_gpu,cplx*dp*vectsize*bblocksize)
 274        call alloc_on_gpu(gram_gpu,cplx*dp*bblocksize*blocksize)
 275      else
 276        call alloc_on_gpu(bblockvector_gpu,cplx*dp*vectsize*blocksize)
 277        call alloc_on_gpu(gram_gpu,cplx*dp*blocksize*blocksize)
 278      end if
 279    end if
 280 
 281 ! Initialize global variables in m_wfutils.
 282    call setWFParameter(cplx,mpi_enreg%me_g0,npw_k,my_nspinor,icg,igsc,blocksize)
 283 
 284 !  transfer array of wf coeff in iblock to blockvectorx
 285    call wfcopy('D',blocksize*vectsize,cg,1,blockvectorx,1,blocksize,iblock,'C',withbbloc=.true.,&
 286 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 287 
 288 !  !!!!!!!!!!!!!!!!!!!!!!!! Begin if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!
 289 !  transfer array of wf coeff less than iblock to blockvectory
 290    if(iblock /=1) then
 291      call wfcopy('D',bblocksize*vectsize,cg,1,blockvectory,1,bblocksize,iblock,'C',withbbloc=.false.,&
 292 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 293 
 294      if(gen_eigenpb) then
 295        call wfcopy('D',bblocksize*vectsize,gsc,1,blockvectorby,1,bblocksize,iblock,'S',withbbloc=.false.,&
 296 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 297      else
 298        call abi_xcopy(vectsize*bblocksize,blockvectory,1,blockvectorby,1,x_cplx=x_cplx)
 299      end if
 300 
 301 !    b-orthogonalize x to the constraint y (supposed b-orthonormal)
 302 !    blockvectorx=blockvectorx-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorx))
 303 
 304      call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,&
 305 &     vectsize,blockvectorx,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx)
 306 
 307      if(abs(dtset%timopt)==3) then
 308        call timab(533,1,tsec)
 309      end if
 310      call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr)
 311      if(abs(dtset%timopt)==3) then
 312        call timab(533,2,tsec)
 313      end if
 314 
 315      call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,&
 316 &     vectsize,gramyx,bblocksize,cone,blockvectorx,vectsize,x_cplx=x_cplx)
 317 
 318    end if
 319 !  !!!!!!!!!!!!!!!!!!!!!!!! End if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!!
 320 
 321    ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize))
 322    ABI_ALLOCATE(gwavef,(2,npw_k*my_nspinor*blocksize))
 323    ABI_ALLOCATE(gvnlc,(2,npw_k*my_nspinor*blocksize))
 324    ABI_ALLOCATE(swavef,(2,npw_k*my_nspinor*blocksize))
 325 
 326    call wfcopy('I',vectsize*blocksize,blockvectorx,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,&
 327 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 328 
 329    if(abs(dtset%timopt)==4) then
 330      call timab(521,2,tsec)
 331    end if
 332    if(abs(dtset%timopt)==4) then
 333      call timab(526,1,tsec)
 334    end if
 335 
 336    cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1
 337 
 338    if (mpi_enreg%paral_kgb==0) then
 339      call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlc,dum,&
 340 &     mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0)
 341    else
 342      call prep_getghc(cwavef,gs_hamk,gvnlc,gwavef,swavef,dum,blocksize,mpi_enreg,&
 343 &     prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.)
 344    end if
 345    if(abs(dtset%timopt)==4) then
 346      call timab(526,2,tsec)
 347    end if
 348    if(abs(dtset%timopt)==4) then
 349      call timab(522,1,tsec)
 350    end if
 351 
 352    if ( gen_eigenpb ) then
 353      call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbx,1,blocksize,iblock,'W',withbbloc=.false.,&
 354 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 355    else
 356      call wfcopy('D',vectsize*blocksize,gvnlc,1,blockvectorvx,1,blocksize,iblock,'W',withbbloc=.false.,&
 357 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 358      call abi_xcopy(vectsize*blocksize,blockvectorx,1,blockvectorbx,1,x_cplx=x_cplx)
 359    end if
 360 
 361    call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorax,1,blocksize,iblock,'W',withbbloc=.false.,&
 362 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 363 
 364    ABI_DEALLOCATE(cwavef)
 365    ABI_DEALLOCATE(gwavef)
 366    ABI_DEALLOCATE(gvnlc)
 367    ABI_DEALLOCATE(swavef)
 368 
 369    call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,mpi_enreg%comm_bandspinorfft,gramxbx,vectsize,&
 370 &   x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 371 
 372    call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorbx,vectsize,x_cplx=x_cplx)
 373    call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorax,vectsize,x_cplx=x_cplx)
 374 
 375    if (gs_hamk%usepaw==0) then
 376      call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorvx,vectsize,x_cplx=x_cplx)
 377    end if
 378 
 379 !  Do rayleigh ritz on a in space x
 380 !  gramxax=matmul(transpose(blockvectorx),blockvectorax)
 381    call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorx,&
 382 &   vectsize,blockvectorax,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 383 
 384    if(abs(dtset%timopt)==3) then
 385      call timab(533,1,tsec)
 386    end if
 387    call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr)
 388    if(abs(dtset%timopt)==3) then
 389      call timab(533,2,tsec)
 390    end if
 391    ABI_ALLOCATE(eigen,(blocksize))
 392 
 393    call abi_xheev('v','u',blocksize,gramxax,eigen,x_cplx=cplx,istwf_k=istwf_k, &
 394    timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu)
 395 
 396 !  blockvectorx=matmul(blockvectorx,gramxax)
 397    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,&
 398 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 399    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorx,1,x_cplx=x_cplx)
 400 
 401 !  blockvectorax=matmul(blockvectorax,gramxax)
 402    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,&
 403 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 404    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorax,1,x_cplx=x_cplx)
 405 
 406 !  blockvectorvx=matmul(blockvectorvx,gramxax)
 407    if (gs_hamk%usepaw==0) then
 408      call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,&
 409 &     vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 410      call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvx,1,x_cplx=x_cplx)
 411    end if
 412 
 413 !  blockvectorbx=matmul(blockvectorbx,gramxax)
 414    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,&
 415 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 416    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbx,1,x_cplx=x_cplx)
 417 
 418    do iblocksize=1,blocksize
 419      zvar=(/eigen(iblocksize),zero/)
 420      call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 421    end do
 422    ABI_DEALLOCATE(eigen)
 423 
 424    if(abs(dtset%timopt)==4) then
 425      call timab(522,2,tsec)
 426    end if
 427 
 428 !  ###########################################################################
 429 !  ################ PERFORM LOOP ON NLINE ####################################
 430 !  ###########################################################################
 431 !  now the main alogrithm
 432    iter: do iterationnumber=1,maxiterations
 433 
 434      if(abs(dtset%timopt)==4) then
 435        call timab(523,1,tsec)
 436      end if
 437 
 438 !    Build residual
 439 !    blockvectorr=blockvectorax-matmul(blockvectorx,lambda)
 440      call xprecon(blockvectorbx,lambda,blocksize,&
 441 &     iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,&
 442 &     optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon)
 443 
 444      residualnorms=sum(blockvectorr**2,dim=1)
 445 
 446      if(abs(dtset%timopt)==3) then
 447        call timab(533,1,tsec)
 448      end if
 449      call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr)
 450      if(abs(dtset%timopt)==3) then
 451        call timab(533,2,tsec)
 452      end if
 453 
 454      resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
 455 
 456 !    If residual sufficiently small stop line minimizations
 457      if (abs(maxval(residualnorms(1:blocksize)))<dtset%tolwfr) then
 458        if (prtvol > 0) then
 459          write(message, '(a,i0,a,i0,a,es12.4)' ) &
 460 &         ' lobpcgwf: block ',iblock,' converged after ',iterationnumber,&
 461 &         ' line minimizations: maxval(resid(1:blocksize)) =',maxval(residualnorms(1:blocksize))
 462          call wrtout(std_out,message,'PERS')
 463        end if
 464        havetoprecon=.false.
 465        exit
 466      end if
 467 
 468      if(use_linalg_gpu==1) then
 469        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 470      end if
 471 
 472      if(iblock /=1) then
 473 !      Residuals orthogonal to blockvectorby
 474 !      blockvectorr=blockvectorr-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorr))
 475 
 476        if(use_linalg_gpu==1) then
 477          call copy_on_gpu(blockvectorby,bblockvector_gpu,cplx*dp*vectsize*bblocksize)
 478          call gpu_xgemm(cplx,cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,bblockvector_gpu,&
 479 &         vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,bblocksize)
 480          call copy_from_gpu(gramyx,gram_gpu,cplx*dp*bblocksize*blocksize)
 481        else
 482          call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,&
 483 &         vectsize,blockvectorr,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx)
 484        end if
 485 
 486        if(abs(dtset%timopt)==3) then
 487          call timab(533,1,tsec)
 488        end if
 489        call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr)
 490        if(abs(dtset%timopt)==3) then
 491          call timab(533,2,tsec)
 492        end if
 493 
 494        if(use_linalg_gpu==1) then
 495          call copy_on_gpu(gramyx,gram_gpu,cplx*dp*bblocksize*blocksize)
 496          call copy_on_gpu(blockvectory,bblockvector_gpu,cplx*dp*vectsize*bblocksize)
 497          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,bblocksize,cminusone,bblockvector_gpu,&
 498 &         vectsize,gram_gpu,bblocksize,cone,blockvectorr_gpu,vectsize)
 499        else
 500          call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,&
 501 &         vectsize,gramyx,bblocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx)
 502        end if
 503 
 504      end if
 505 
 506 !    Residuals orthogonal to blockvectorx
 507 !    blockvectorr=blockvectorr-matmul(blockvectorx,matmul((blockvectorbx)^T,blockvectorr))
 508      if(use_linalg_gpu==1) then
 509        call copy_on_gpu(blockvectorbx,C_gpu,cplx*dp*vectsize*blocksize)
 510        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,C_gpu,&
 511 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 512        call copy_from_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize)
 513      else
 514        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,&
 515 &       vectsize,blockvectorr,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 516      end if
 517 
 518      if(abs(dtset%timopt)==3) then
 519        call timab(533,1,tsec)
 520      end if
 521      call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr)
 522      if(abs(dtset%timopt)==3) then
 523        call timab(533,2,tsec)
 524      end if
 525 
 526      if(use_linalg_gpu==1) then
 527        call copy_on_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize)
 528        call copy_on_gpu(blockvectorx,C_gpu,cplx*dp*vectsize*blocksize)
 529        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cminusone,C_gpu,&
 530 &       vectsize,gram_gpu,blocksize,cone,blockvectorr_gpu,vectsize)
 531        call copy_from_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 532      else
 533        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cminusone,blockvectorx,&
 534 &       vectsize,gramxax,blocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx)
 535      end if
 536 
 537      ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize))
 538      ABI_ALLOCATE(gwavef,(2,npw_k*my_nspinor*blocksize))
 539      ABI_ALLOCATE(gvnlc,(2,npw_k*my_nspinor*blocksize))
 540      ABI_ALLOCATE(swavef,(2,npw_k*my_nspinor*blocksize))
 541 
 542      call wfcopy('I',vectsize*blocksize,blockvectorr,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,&
 543 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 544 
 545      cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1
 546 
 547      if(abs(dtset%timopt)==4) then
 548        call timab(523,2,tsec)
 549      end if
 550      if(abs(dtset%timopt)==4) then
 551        call timab(526,1,tsec)
 552      end if
 553 
 554      if (mpi_enreg%paral_kgb==0) then
 555        call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlc,dum,&
 556 &       mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0)
 557      else
 558        call prep_getghc(cwavef,gs_hamk,gvnlc,gwavef,swavef,dum,blocksize,mpi_enreg,&
 559 &       prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.)
 560      end if
 561 
 562      if(abs(dtset%timopt)==4) then
 563        call timab(526,2,tsec)
 564      end if
 565      if(abs(dtset%timopt)==4) then
 566        call timab(524,1,tsec)
 567      end if
 568 
 569      if (gen_eigenpb) then
 570        call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbr,1,blocksize,iblock,'W',withbbloc=.false.,&
 571 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 572      else
 573        call abi_xcopy(vectsize*blocksize,blockvectorr,1,blockvectorbr,1,x_cplx=x_cplx)
 574        call wfcopy('D',vectsize*blocksize,gvnlc,1,blockvectorvr,1,blocksize,iblock,'W',withbbloc=.false.,&
 575 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 576      end if
 577 
 578      call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorar,1,blocksize,iblock,'W',withbbloc=.false.,&
 579 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 580 
 581      ABI_DEALLOCATE(cwavef)
 582      ABI_DEALLOCATE(gwavef)
 583      ABI_DEALLOCATE(gvnlc)
 584      ABI_DEALLOCATE(swavef)
 585 
 586      if(use_linalg_gpu==1) then
 587        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 588        call gpu_xorthonormalize(blockvectorr_gpu,blockvectorbr_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,&
 589 &       x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 590        call copy_from_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 591        call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorbr_gpu,vectsize)
 592        call copy_from_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 593        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 594        call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorar_gpu,vectsize)
 595        call copy_from_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 596        if (gs_hamk%usepaw==0) then
 597          call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 598          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 599          call copy_from_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 600        end if
 601        call copy_from_gpu(gramrbr,gram_gpu,cplx*dp*blocksize*blocksize)
 602      else
 603        call abi_xorthonormalize(blockvectorr,blockvectorbr,blocksize,mpi_enreg%comm_bandspinorfft,gramrbr,vectsize,&
 604 &       x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 605        call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorbr,vectsize,x_cplx=x_cplx)
 606        call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorar,vectsize,x_cplx=x_cplx)
 607        if (gs_hamk%usepaw==0) then
 608          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorvr,vectsize,x_cplx=x_cplx)
 609        end if
 610      end if
 611 
 612      if(iterationnumber>1) then
 613        if(use_linalg_gpu==1) then
 614          call copy_on_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize)
 615          call copy_on_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
 616          call gpu_xorthonormalize(A_gpu,C_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,&
 617 &         x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 618          call copy_from_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize)
 619          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,C_gpu,vectsize)
 620          call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
 621          call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 622          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 623          call copy_from_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 624          if (gs_hamk%usepaw==0) then
 625            call copy_on_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize)
 626            call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 627            call copy_from_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize)
 628          end if
 629          call copy_from_gpu(grampbp,gram_gpu,cplx*dp*blocksize*blocksize)
 630        else
 631 !        call orthonormalize(blockvectorp,blockvectorbp,blockvectorap)
 632          call abi_xorthonormalize(blockvectorp,blockvectorbp,blocksize,mpi_enreg%comm_bandspinorfft,grampbp,vectsize,&
 633 &         x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 634 !        blockvectorap=matmul(blockvectorap,grampbp)
 635          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorbp,vectsize,x_cplx=x_cplx)
 636          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorap,vectsize,x_cplx=x_cplx)
 637          if (gs_hamk%usepaw==0) then
 638            call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorvp,vectsize,x_cplx=x_cplx)
 639          end if
 640        end if
 641      end if
 642 
 643      activersize=blocksize
 644      if (iterationnumber==1) then
 645        activepsize=0
 646        restart=1
 647      else
 648        activepsize=blocksize
 649        restart=0
 650      end if
 651 
 652 !    gramxar=matmul((blockvectorax)^T,blockvectorr)
 653 !    gramrar=matmul((blockvectorar)^T,blockvectorr)
 654 !    gramxax=matmul((blockvectorax)^T,blockvectorx)
 655      if(use_linalg_gpu==1) then
 656        call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize)
 657        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,&
 658 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 659        call copy_from_gpu(gramxar,gram_gpu,cplx*dp*blocksize*blocksize)
 660        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar_gpu,&
 661 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 662        call copy_from_gpu(gramrar,gram_gpu,cplx*dp*blocksize*blocksize)
 663        call copy_on_gpu(blockvectorx,C_gpu,cplx*dp*vectsize*blocksize)
 664        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,&
 665 &       vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 666        call copy_from_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize)
 667      else
 668        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 669 &       vectsize,blockvectorr,vectsize,czero,gramxar,blocksize,x_cplx=x_cplx)
 670        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,&
 671 &       vectsize,blockvectorr,vectsize,czero,gramrar,blocksize,x_cplx=x_cplx)
 672        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 673 &       vectsize,blockvectorx,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 674      end if
 675 
 676      call abi_xcopy(blocksize*blocksize,gramxar,1,transf3(:,:,1),1,x_cplx=x_cplx)
 677      call abi_xcopy(blocksize*blocksize,gramrar,1,transf3(:,:,2),1,x_cplx=x_cplx)
 678      call abi_xcopy(blocksize*blocksize,gramxax,1,transf3(:,:,3),1,x_cplx=x_cplx)
 679      if(abs(dtset%timopt)==3) then
 680        call timab(533,1,tsec)
 681      end if
 682      call xmpi_sum(transf3,mpi_enreg%comm_bandspinorfft,ierr)
 683      if(abs(dtset%timopt)==3) then
 684        call timab(533,2,tsec)
 685      end if
 686 
 687      call abi_xcopy(blocksize*blocksize,transf3(:,:,1),1,gramxar,1,x_cplx=x_cplx)
 688      call abi_xcopy(blocksize*blocksize,transf3(:,:,2),1,gramrar,1,x_cplx=x_cplx)
 689      call abi_xcopy(blocksize*blocksize,transf3(:,:,3),1,gramxax,1,x_cplx=x_cplx)
 690 
 691 !    gramxbx=matmul((blockvectorbx)^T,blockvectorx)
 692 !    gramrbr=matmul((blockvectorbr)^T,blockvectorr)
 693 !    gramxbr=matmul((blockvectorbx)^T,blockvectorr)
 694 !    Note that the gramb matrix is more easier to construct than grama:
 695 !    i) <x|B|x>=<r|B|r>=<p|B|p>=(1;0)
 696 !    since the x, r and p blockvector are normalized
 697 !    ii) <r|B|x>=(0;0)
 698 !    since the x and r blockvector are orthogonalized
 699 !    iii) The <p|B|r> and <p|B|x> have to be computed.
 700      gramxbx(:,:)=zero
 701      gramrbr(:,:)=zero
 702      gramxbr(:,:)=zero
 703      do iblocksize=1,blocksize
 704        zvar=(/one,zero/)
 705        call abi_xcopy(1,zvar,1,gramxbx(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 706        call abi_xcopy(1,zvar,1,gramrbr(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 707      end do
 708 
 709 !    ###########################################################################
 710 !    ################ PERFORM LOOP ON COND #####################################
 711 !    ###########################################################################
 712 
 713      i1=0;i2=blocksize;i3=2*blocksize;i4=3*blocksize
 714      cond: do cond_try=1,2 !2 when restart
 715        if (restart==0) then
 716 
 717 !        gramxap=matmul((blockvectorax)^T,blockvectorp)
 718 !        gramrap=matmul((blockvectorar)^T,blockvectorp)
 719 !        grampap=matmul((blockvectorap)^T,blockvectorp)
 720 !        gramxbp=matmul((blockvectorbx)^T,blockvectorp)
 721 !        gramrbp=matmul((blockvectorbr)^T,blockvectorp)
 722 !        grampbp=matmul((blockvectorbp)^T,blockvectorp)
 723          if(use_linalg_gpu==1) then
 724            call copy_on_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize)
 725            call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize)
 726            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 727 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 728            call copy_from_gpu(gramxap,gram_gpu,cplx*dp*blocksize*blocksize)
 729            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 730 &           cone,blockvectorar_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 731            call copy_from_gpu(gramrap,gram_gpu,cplx*dp*blocksize*blocksize)
 732            call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 733            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 734 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 735            call copy_from_gpu(grampap,gram_gpu,cplx*dp*blocksize*blocksize)
 736            call copy_on_gpu(blockvectorbx,A_gpu,cplx*dp*vectsize*blocksize)
 737            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 738 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 739            call copy_from_gpu(gramxbp,gram_gpu,cplx*dp*blocksize*blocksize)
 740            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 741 &           cone,blockvectorbr_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 742            call copy_from_gpu(gramrbp,gram_gpu,cplx*dp*blocksize*blocksize)
 743          else
 744            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 745 &           vectsize,blockvectorp,vectsize,czero,gramxap,blocksize,x_cplx=x_cplx)
 746            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,&
 747 &           vectsize,blockvectorp,vectsize,czero,gramrap,blocksize,x_cplx=x_cplx)
 748            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorap,&
 749 &           vectsize,blockvectorp,vectsize,czero,grampap,blocksize,x_cplx=x_cplx)
 750            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,&
 751 &           vectsize,blockvectorp,vectsize,czero,gramxbp,blocksize,x_cplx=x_cplx)
 752            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbr,&
 753 &           vectsize,blockvectorp,vectsize,czero,gramrbp,blocksize,x_cplx=x_cplx)
 754          end if
 755 !        It's not necessary to compute the last one: <p|B|p>=(1;0) (see above)
 756          transf5(:,:,1)=gramxap(:,:)
 757          transf5(:,:,2)=gramrap(:,:)
 758          transf5(:,:,3)=grampap(:,:)
 759          transf5(:,:,4)=gramxbp(:,:)
 760          transf5(:,:,5)=gramrbp(:,:)
 761          if(abs(dtset%timopt)==3) then
 762            call timab(533,1,tsec)
 763          end if
 764          call xmpi_sum(transf5,mpi_enreg%comm_bandspinorfft,ierr)
 765          if(abs(dtset%timopt)==3) then
 766            call timab(533,2,tsec)
 767          end if
 768          gramxap(:,:)=transf5(:,:,1)
 769          gramrap(:,:)=transf5(:,:,2)
 770          grampap(:,:)=transf5(:,:,3)
 771          gramxbp(:,:)=transf5(:,:,4)
 772          gramrbp(:,:)=transf5(:,:,5)
 773          grampbp(:,:)=zero
 774          do iblocksize=1,blocksize
 775            zvar=(/one,zero/)
 776            call abi_xcopy(1,zvar,1,grampbp(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 777          end do
 778          bigorder=i4
 779          ABI_ALLOCATE(grama,(cplx*i4,i4))
 780          ABI_ALLOCATE(gramb,(cplx*i4,i4))
 781          ABI_ALLOCATE(eigen,(i4))
 782 !        ABI_ALLOCATE(coordx,(cplx*i4,blocksize))
 783          ABI_ALLOCATE(coordx1,(cplx*blocksize,blocksize))
 784          ABI_ALLOCATE(coordx2,(cplx*blocksize,blocksize))
 785          ABI_ALLOCATE(coordx3,(cplx*blocksize,blocksize))
 786          grama(:,:)=zero;gramb(:,:)=zero
 787          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax
 788          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar
 789          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxap
 790          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar
 791          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrap
 792          grama(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampap
 793          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx
 794          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr
 795          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxbp
 796          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr
 797          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrbp
 798          gramb(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampbp
 799        else
 800          bigorder=i3
 801          ABI_ALLOCATE(grama,(cplx*i3,i3))
 802          ABI_ALLOCATE(gramb,(cplx*i3,i3))
 803          ABI_ALLOCATE(eigen,(i3))
 804 !        ABI_ALLOCATE(coordx,(cplx*i3,blocksize))
 805          ABI_ALLOCATE(coordx1,(cplx*blocksize,blocksize))
 806          ABI_ALLOCATE(coordx2,(cplx*blocksize,blocksize))
 807          grama(:,:)=zero;gramb(:,:)=zero
 808          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax
 809          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar
 810          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar
 811          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx
 812          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr
 813          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr
 814        end if
 815 
 816        ABI_ALLOCATE(tmpgramb,(cplx*bigorder,bigorder))
 817        ABI_ALLOCATE(tmpeigen,(bigorder))
 818        tmpgramb=gramb
 819 
 820        call abi_xheev('v','u',bigorder,tmpgramb,tmpeigen,x_cplx=cplx,istwf_k=istwf_k, &
 821 &       timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu)
 822 
 823        condestgramb=tmpeigen(bigorder)/tmpeigen(1)
 824        ABI_DEALLOCATE(tmpgramb)
 825        ABI_DEALLOCATE(tmpeigen)
 826 
 827        if (condestgramb.gt.1d+5.or.condestgramb.lt.0.d0.or.info/=0) then
 828          write(std_out,*)'condition number of the Gram matrix = ',condestgramb
 829          if (cond_try==1.and.restart==0) then
 830            ABI_DEALLOCATE(grama)
 831            ABI_DEALLOCATE(gramb)
 832            ABI_DEALLOCATE(eigen)
 833 !          ABI_DEALLOCATE(coordx)
 834            ABI_DEALLOCATE(coordx1)
 835            ABI_DEALLOCATE(coordx2)
 836            if(bigorder==i4) then
 837              ABI_DEALLOCATE(coordx3)
 838            end if
 839            if (nrestart.gt.1) then
 840              MSG_WARNING('the minimization is stopped for this block')
 841              exit iter
 842            else
 843              restart=1
 844              nrestart=nrestart+1
 845              call wrtout(std_out,'Lobpcgwf: restart performed',"PERS")
 846            end if
 847          else
 848            MSG_WARNING('Gramm matrix ill-conditionned: results may be unpredictable')
 849          end if
 850        else
 851          exit cond
 852        end if
 853      end do cond
 854 
 855 !    ###########################################################################
 856 !    ################ END LOOP ON COND #########################################
 857 !    ###########################################################################
 858 
 859      call abi_xhegv(1,'v','u',bigorder,grama,gramb,eigen,x_cplx=cplx,istwf_k=istwf_k, &
 860      timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu)
 861 
 862      deltae=-one
 863      do iblocksize=1,blocksize
 864        call abi_xcopy(1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,zvar,1,x_cplx=x_cplx)
 865        deltae=max(deltae,abs(cmplx(zvar(1),zvar(2))-eigen(iblocksize)))
 866        zvar=(/eigen(iblocksize),zero/)
 867        call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,x_cplx=x_cplx)
 868      end do
 869 
 870 !    DEBUG
 871 !    write(std_out,*)'eigen',eigen(1:blocksize)
 872 !    ENDDEBUG
 873 
 874 !    coordx(1:bigorder*cplx,1:blocksize)=grama(1:bigorder*cplx,1:blocksize)
 875      coordx1(:,:) =  grama(1+i1*cplx : i2*cplx,1:blocksize)
 876      coordx2(:,:) =  grama(1+i2*cplx : i3*cplx,1:blocksize)
 877      if(bigorder==i4) then
 878        coordx3(:,:) =  grama(1+i3*cplx : i4*cplx,1:blocksize)
 879      end if
 880 
 881 
 882      if(use_linalg_gpu==1) then
 883        call copy_on_gpu(coordx2,coordx2_gpu,cplx*dp*blocksize*blocksize)
 884        if(bigorder==i4) then
 885          call copy_on_gpu(coordx3,coordx3_gpu,cplx*dp*blocksize*blocksize)
 886        end if
 887      end if
 888 
 889      ABI_DEALLOCATE(grama)
 890      ABI_DEALLOCATE(gramb)
 891      ABI_DEALLOCATE(eigen)
 892      if (restart==0 .and. iterationnumber >1) then
 893 
 894 !      blockvectorp=matmul(blockvectorr,coordx(i2+1:i3,:))+&
 895 !      &               matmul(blockvectorp,coordx(i3+1:i4,:))
 896        if(use_linalg_gpu==1) then
 897 !        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 898          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,&
 899 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 900          call copy_on_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize)
 901          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 902 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 903          call copy_from_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize)
 904        else
 905          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,&
 906 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 907          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorp,&
 908 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 909          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorp,1,x_cplx=x_cplx)
 910        end if
 911 
 912 !      blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))+&
 913 !      &                matmul(blockvectorap,coordx(i3+1:i4,:))
 914        if(use_linalg_gpu==1) then
 915 !        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 916          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,&
 917 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 918          call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 919          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 920 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 921          call copy_from_gpu(blockvectorap,C_gpu,cplx*dp*vectsize*blocksize)
 922        else
 923          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,&
 924 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 925          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorap,&
 926 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 927          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorap,1,x_cplx=x_cplx)
 928        end if
 929 
 930 
 931 !      blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))+&
 932 !      &                matmul(blockvectorvp,coordx(i3+1:i4,:))
 933        if (gs_hamk%usepaw==0) then
 934          if(use_linalg_gpu==1) then
 935            call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 936            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 937 &           vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 938            call copy_on_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize)
 939            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 940 &           vectsize,coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 941            call copy_from_gpu(blockvectorvp,C_gpu,cplx*dp*vectsize*blocksize)
 942          else
 943            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,&
 944 &           vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 945            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvp,&
 946 &           vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 947            call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvp,1,x_cplx=x_cplx)
 948          end if
 949        end if
 950 
 951 !      blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))+&
 952 !      &                matmul(blockvectorbp,coordx(i3+1:i4,:))
 953        if(use_linalg_gpu==1) then
 954 !        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 955          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,&
 956 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 957          call copy_on_gpu(blockvectorbp,A_gpu,cplx*dp*vectsize*blocksize)
 958          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 959 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 960          call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
 961        else
 962          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,&
 963 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 964          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbp,&
 965 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 966          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbp,1,x_cplx=x_cplx)
 967        end if
 968 
 969      else
 970 
 971 !      blockvectoSz =matmul(blockvectorr,coordx(i2+1:i3,:))
 972        if(use_linalg_gpu==1) then
 973 !        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 974          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,&
 975 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 976          call copy_from_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize)
 977        else
 978          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,&
 979 &         vectsize,coordx2,blocksize,czero,blockvectorp,vectsize,x_cplx=x_cplx)
 980        end if
 981 
 982 !      blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))
 983        if(use_linalg_gpu==1) then
 984 !        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 985          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,&
 986 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 987          call copy_from_gpu(blockvectorap,C_gpu,cplx*dp*vectsize*blocksize)
 988        else
 989          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,&
 990 &         vectsize,coordx2,blocksize,czero,blockvectorap,vectsize,x_cplx=x_cplx)
 991        end if
 992 !      blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))
 993        if (gs_hamk%usepaw==0) then
 994          if(use_linalg_gpu==1) then
 995            call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 996            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 997 &           vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 998            call copy_from_gpu(blockvectorvp,C_gpu,cplx*dp*vectsize*blocksize)
 999          else
1000            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,&
1001 &           vectsize,coordx2,blocksize,czero,blockvectorvp,vectsize,x_cplx=x_cplx)
1002          end if
1003        end if
1004 
1005 !      blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))
1006        if(use_linalg_gpu==1) then
1007 !        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
1008          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,&
1009 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1010          call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
1011        else
1012          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,&
1013 &         vectsize,coordx2,blocksize,czero,blockvectorbp,vectsize,x_cplx=x_cplx)
1014        end if
1015      end if
1016 
1017      if(use_linalg_gpu==1) then
1018        call copy_on_gpu(coordx1,coordx2_gpu,cplx*dp*blocksize*blocksize)
1019      end if
1020 
1021 !    blockvectorx = matmul(blockvectorx,coordx(i1+1:i2,:))+blockvectorp
1022      if(use_linalg_gpu==1) then
1023        call copy_on_gpu(blockvectorx,A_gpu,cplx*dp*vectsize*blocksize)
1024        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1025 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1026        call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1027      else
1028        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,&
1029 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1030      end if
1031      blockvectorx = blockvectordumm+blockvectorp
1032 
1033 !    blockvectorax= matmul(blockvectorax,coordx(i1+1:i2,:))+blockvectorap
1034      if(use_linalg_gpu==1) then
1035        call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize)
1036        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1037 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1038        call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1039      else
1040        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,&
1041 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1042      end if
1043      blockvectorax = blockvectordumm+blockvectorap
1044 
1045 !    blockvectorvx= matmul(blockvectorvx,coordx(i1+1:i2,:))+blockvectorvp
1046      if (gs_hamk%usepaw==0) then
1047        if(use_linalg_gpu==1) then
1048          call copy_on_gpu(blockvectorvx,A_gpu,cplx*dp*vectsize*blocksize)
1049          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1050 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1051          call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1052        else
1053          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,&
1054 &         vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1055        end if
1056        blockvectorvx = blockvectordumm+blockvectorvp
1057      end if
1058 
1059 !    blockvectorbx= matmul(blockvectorbx,coordx(i1+1:i2,:))+blockvectorbp
1060      if(use_linalg_gpu==1) then
1061        call copy_on_gpu(blockvectorbx,A_gpu,cplx*dp*vectsize*blocksize)
1062        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1063 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1064        call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1065      else
1066        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,&
1067 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1068      end if
1069      blockvectorbx = blockvectordumm+blockvectorbp
1070 
1071 !    ABI_DEALLOCATE(coordx)
1072      ABI_DEALLOCATE(coordx1)
1073      ABI_DEALLOCATE(coordx2)
1074      if(bigorder==i4) then
1075        ABI_DEALLOCATE(coordx3)
1076      end if
1077 
1078 !    Check convergence on energy and eventually exit
1079      if (iterationnumber==1) then
1080        deold=deltae
1081      else if (iterationnumber>1) then
1082        if ((abs(deltae)<0.005*abs(deold)).and.(iterationnumber/=maxiterations))then
1083          if(prtvol>=10)then
1084            write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) &
1085 &           ' lobpcgwf: block',iblock,', line',iterationnumber,&
1086 &           ', deltae=',deltae,' < 0.005*',deold,' =>skip lines !'
1087            call wrtout(std_out,message,'PERS')
1088          end if
1089          exit
1090        else if (abs(deltae)>0.005*abs(deold)) then
1091          if(prtvol>=10)then
1092            write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) &
1093 &           ' lobpcgwf: block',iblock,', line',iterationnumber,&
1094 &           ', deltae=',deltae,' > 0.005*',deold,' =>keep on working !'
1095            call wrtout(std_out,message,'PERS')
1096          end if
1097        end if
1098      end if
1099 
1100      if(abs(dtset%timopt)==4) then
1101        call timab(524,2,tsec)
1102      end if
1103 
1104    end do iter
1105 
1106 !  ###########################################################################
1107 !  ################## END LOOP ON NLINE ######################################
1108 !  ###########################################################################
1109 
1110    if(abs(dtset%timopt)==4) then
1111      call timab(525,1,tsec)
1112    end if
1113 
1114    if (havetoprecon) then
1115      call xprecon(blockvectorbx,lambda,blocksize,&
1116 &     iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,&
1117 &     optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon)
1118 
1119      residualnorms=sum(abs(blockvectorr)**2,dim=1)
1120 
1121      if(abs(dtset%timopt)==3) then
1122        call timab(533,1,tsec)
1123      end if
1124      call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr)
1125      if(abs(dtset%timopt)==3) then
1126        call timab(533,2,tsec)
1127      end if
1128 
1129      resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
1130    end if
1131 
1132    call wfcopy('I',vectsize*blocksize,blockvectorx,1,cg,1,blocksize,iblock,'C',withbbloc=.true.,&
1133 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
1134 
1135    if(gen_eigenpb) then
1136      call wfcopy('I',vectsize*blocksize,blockvectorbx,1,gsc,1,blocksize,iblock,'S',withbbloc=.true.,&
1137 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
1138    end if
1139 
1140 !  The Vnl part of the Hamiltonian is no more stored in the packed form such as it was the case for subvnl(:).
1141 !  Now, the full matrix is stored in totvnl(:,:). This trick permits:
1142 !  1) to avoid the reconstruction of the total matrix in vtowfk.F90 (double loop over bands)
1143 !  2) to use two optimized matrix-matrix blas routine for general (in lobpcgccwf.F90) or hermitian (in vtowfk.F90)
1144 !  operators, zgemm.f and zhemm.f respectively, rather than a triple loop in both cases.
1145    iwavef=iblock*blocksize
1146    isubh=1+2*bblocksize*(bblocksize+1)/2
1147 
1148    ABI_ALLOCATE(blockvectorz,(cplx*vectsize,iwavef))
1149    if(bblocksize > 0 ) then
1150      call abi_xcopy(bblocksize*vectsize,blockvectory(:,1:bblocksize),1,blockvectorz(:,1:bblocksize),1,x_cplx=x_cplx)
1151    end if
1152    call abi_xcopy( blocksize*vectsize,blockvectorx(:,1:blocksize) ,1,blockvectorz(:,bblocksize+1:iwavef),1,x_cplx=x_cplx)
1153 
1154    ABI_ALLOCATE(tsubham,(cplx*iwavef,blocksize))
1155    tsubham(:,:)=zero
1156    call abi_xgemm(cparam(cplx),'n',iwavef,blocksize,vectsize,cone,blockvectorz,vectsize,&
1157 &   blockvectorax,vectsize,czero,tsubham,iwavef,x_cplx=x_cplx)
1158 
1159    if (gs_hamk%usepaw==0) then
1160      ! MG FIXME: Here gfortran4.9 allocates temporary array for C in abi_d2zgemm.
1161      call abi_xgemm(cparam(cplx),'n',blocksize,iwavef,vectsize,cone,blockvectorvx,vectsize,&
1162 &     blockvectorz,vectsize,czero,totvnl(cplx*bblocksize+1:cplx*iwavef,1:iwavef),blocksize,x_cplx=x_cplx)
1163    end if
1164 
1165    do iblocksize=1,blocksize
1166      do ii=1,bblocksize+iblocksize
1167        if ( cplx == 1 ) then
1168          subham(isubh)  = tsubham(ii,iblocksize)
1169          subham(isubh+1)= zero
1170        else
1171          subham(isubh)  = tsubham(2*ii-1,iblocksize)
1172          subham(isubh+1)= tsubham(2*ii  ,iblocksize)
1173        end if
1174        isubh=isubh+2
1175      end do
1176    end do
1177    ABI_DEALLOCATE(tsubham)
1178    ABI_DEALLOCATE(blockvectorz)
1179 !  comm for subham and subvnl are made in vtowfk
1180 
1181    ABI_DEALLOCATE(pcon)
1182    ABI_DEALLOCATE(blockvectory)
1183    ABI_DEALLOCATE(blockvectorby)
1184    ABI_DEALLOCATE(gramyx)
1185    ABI_DEALLOCATE(blockvectorx)
1186    ABI_DEALLOCATE(blockvectorax)
1187    ABI_DEALLOCATE(blockvectorbx)
1188    ABI_DEALLOCATE(blockvectorr)
1189    ABI_DEALLOCATE(blockvectorar)
1190    ABI_DEALLOCATE(blockvectorbr)
1191    ABI_DEALLOCATE(blockvectorp)
1192    ABI_DEALLOCATE(blockvectorap)
1193    ABI_DEALLOCATE(blockvectorbp)
1194    if (gs_hamk%usepaw==0) then
1195      ABI_DEALLOCATE(blockvectorvx)
1196      ABI_DEALLOCATE(blockvectorvp)
1197      ABI_DEALLOCATE(blockvectorvr)
1198    end if
1199    ABI_DEALLOCATE(blockvectordumm)
1200    ABI_DEALLOCATE(gramxax)
1201    ABI_DEALLOCATE(gramxar)
1202    ABI_DEALLOCATE(gramxap)
1203    ABI_DEALLOCATE(gramrar)
1204    ABI_DEALLOCATE(gramrap)
1205    ABI_DEALLOCATE(grampap)
1206    ABI_DEALLOCATE(gramxbx)
1207    ABI_DEALLOCATE(gramxbr)
1208    ABI_DEALLOCATE(gramxbp)
1209    ABI_DEALLOCATE(gramrbr)
1210    ABI_DEALLOCATE(gramrbp)
1211    ABI_DEALLOCATE(grampbp)
1212    ABI_DEALLOCATE(transf3)
1213    ABI_DEALLOCATE(transf5)
1214    ABI_DEALLOCATE(lambda)
1215    ABI_DEALLOCATE(residualnorms)
1216    if(use_linalg_gpu==1) then
1217      call dealloc_on_gpu(bblockvector_gpu)
1218      call dealloc_on_gpu(gram_gpu)
1219    end if
1220 
1221  end do  ! End big loop over bands inside blocks
1222 
1223  if(use_linalg_gpu==1) then
1224    call dealloc_on_gpu(blockvectorr_gpu)
1225    call dealloc_on_gpu(blockvectorar_gpu)
1226    call dealloc_on_gpu(blockvectorbr_gpu)
1227    call dealloc_on_gpu(A_gpu)
1228    call dealloc_on_gpu(C_gpu)
1229    call dealloc_on_gpu(coordx2_gpu)
1230    call dealloc_on_gpu(coordx3_gpu)
1231    !call gpu_linalg_shutdown()
1232  end if
1233 
1234  if(abs(dtset%timopt)==4) then
1235    call timab(525,2,tsec)
1236  end if
1237  call timab(530,2,tsec)
1238 
1239  DBG_ENTER("COLL")
1240 
1241  contains
1242 
1243    function gramindex(iblocksize)
1244 
1245 
1246 !This section has been created automatically by the script Abilint (TD).
1247 !Do not modify the following lines by hand.
1248 #undef ABI_FUNC
1249 #define ABI_FUNC 'gramindex'
1250 !End of the abilint section
1251 
1252    integer :: gramindex,iblocksize
1253    gramindex=(iblocksize-1)*cplx+1
1254  end function gramindex
1255 
1256 end subroutine lobpcgwf