TABLE OF CONTENTS


ABINIT/lobpcgwf [ Functions ]

[ Top ] [ Functions ]

NAME

 lobpcgwf

FUNCTION

 this routine updates the whole wave functions at a given k-point,
 using the lobpcg method
 for a given spin-polarization, from a fixed hamiltonian
 but might also simply compute eigenvectors and eigenvalues at this k point.
 it will also update the matrix elements of the hamiltonian.

COPYRIGHT

 Copyright (C) 1998-2018 ABINIT group (FBottin,GZ,AR,MT,FDahm)
 this file is distributed under the terms of the
 gnu general public license, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 for the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  dtset <type(dataset_type)>=all input variales for this dataset
  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
  icg=shift to be applied on the location of data in the array cg
  igsc=shift to be applied on the location of data in the array gsc
  kinpw(npw)=(modified) kinetic energy for each plane wave (hartree)
  mcg=second dimension of the cg array
  mgsc=second dimension of the gsc array
  mpi_enreg=informations about MPI parallelization
  nband_k=number of bands at this k point for that spin polarization
  nbdblock : number of blocks
  npw_k=number of plane waves at this k point
  prtvol=control print volume and debugging output

OUTPUT

  resid_k(nband_k)=residuals for each states
  subham(nband_k*(nband_k+1))=the matrix elements of h
  If gs_hamk%usepaw==0:
    gsc(2,mgsc)=<g|s|c> matrix elements (s=overlap)
    totvnl(nband_k*(1-gs_hamk%usepaw),nband_k*(1-gs_hamk%usepaw))=the matrix elements of vnl

SIDE EFFECTS

  cg(2,mcg)=updated wavefunctions

PARENTS

      vtowfk

CHILDREN

      abi_xcopy,abi_xgemm,abi_xheev,abi_xhegv,abi_xorthonormalize,abi_xtrsm
      alloc_on_gpu,copy_from_gpu,copy_on_gpu,dealloc_on_gpu,getghc,gpu_xgemm
      gpu_xorthonormalize,gpu_xtrsm,prep_getghc,setwfparameter,timab,wfcopy
      wrtout,xmpi_sum,xprecon

SOURCE

  93 subroutine lobpcgwf(cg,dtset,gs_hamk,gsc,icg,igsc,kinpw,mcg,mgsc,mpi_enreg,&
  94 &                   nband_k,nbdblock,npw_k,prtvol,resid_k,subham,totvnl)
  95 
  96 
  97  use defs_abitypes
  98  use defs_basis
  99  use m_abicore
 100  use m_lobpcg
 101  use m_abi_linalg
 102  use m_wfutils
 103  use m_xmpi
 104  use m_errors
 105  use iso_c_binding
 106 
 107  use m_time,        only : timab
 108  use m_hamiltonian, only : gs_hamiltonian_type
 109  use m_pawcprj,     only : pawcprj_type
 110  use m_getghc,      only : getghc
 111  use m_prep_kgb,    only : prep_getghc
 112 
 113 !This section has been created automatically by the script Abilint (TD).
 114 !Do not modify the following lines by hand.
 115 #undef ABI_FUNC
 116 #define ABI_FUNC 'lobpcgwf'
 117 !End of the abilint section
 118 
 119  implicit none
 120 
 121 !Arguments ------------------------------------
 122  integer,intent(in) :: icg,igsc,mcg,mgsc,nband_k,nbdblock,npw_k,prtvol
 123  type(gs_hamiltonian_type),intent(inout) :: gs_hamk
 124  type(dataset_type),intent(in) :: dtset
 125  type(mpi_type),intent(inout) :: mpi_enreg
 126  real(dp),intent(inout) :: cg(2,mcg),gsc(2,mgsc)
 127  real(dp),intent(in) :: kinpw(npw_k)
 128  real(dp),intent(out) :: resid_k(nband_k)
 129  real(dp),intent(inout) :: subham(nband_k*(nband_k+1))
 130  real(dp),intent(inout) :: totvnl((3-gs_hamk%istwf_k)*nband_k*(1-gs_hamk%usepaw),nband_k*(1-gs_hamk%usepaw))
 131 
 132 !Local variables-------------------------------
 133  integer, parameter :: tim_getghc=5
 134  integer :: activepsize,activersize,bblocksize,bigorder,blocksize,cpopt
 135  integer :: cond_try
 136  integer :: iblocksize,iblock,ierr,ii,info,istwf_k,isubh
 137  integer :: iterationnumber
 138  integer :: iwavef,i1,i2,i3,i4,maxiterations,my_nspinor
 139  integer :: nrestart,optekin,optpcon,restart
 140  integer :: sij_opt,timopt,tim_wfcopy,tim_xeigen
 141  integer :: tim_xortho,tim_xprecon,use_lapack_gpu,use_linalg_gpu,vectsize
 142  logical :: gen_eigenpb
 143  integer :: cplx
 144  real(dp) :: condestgramb,deltae,deold,dum
 145  complex(dpc) :: cminusone
 146  real(dp) :: zvar(2)
 147  logical :: havetoprecon
 148  real(dp) :: tsec(2)
 149  real(dp), allocatable :: gwavef(:,:),cwavef(:,:),gvnlc(:,:)
 150  real(dp), allocatable :: swavef(:,:)
 151  real(dp), allocatable :: residualnorms(:),eigen(:)
 152  real(dp), allocatable :: tmpeigen(:)
 153  real(dp), allocatable :: pcon(:,:)
 154  real(dp), allocatable :: blockvectorx(:,:),blockvectorvx(:,:),blockvectorax(:,:),blockvectorbx(:,:)
 155  real(dp),allocatable  :: blockvectorr(:,:),blockvectorvr(:,:),blockvectorar(:,:),blockvectorbr(:,:)
 156  real(dp),allocatable  :: blockvectorp(:,:),blockvectorvp(:,:),blockvectorap(:,:),blockvectorbp(:,:),blockvectordumm(:,:)
 157  real(dp),allocatable  :: blockvectory(:,:),blockvectorby(:,:),blockvectorz(:,:)
 158  real(dp),allocatable  :: gramxax(:,:),gramxar(:,:),gramxap(:,:),gramrar(:,:),gramrap(:,:),grampap(:,:)
 159  real(dp),allocatable  :: gramxbx(:,:),gramxbr(:,:),gramxbp(:,:),gramrbr(:,:),gramrbp(:,:),grampbp(:,:)
 160  real(dp),allocatable  :: coordx1(:,:),coordx2(:,:),coordx3(:,:),lambda(:,:),grama(:,:),gramb(:,:),gramyx(:,:)
 161  real(dp),allocatable  :: tmpgramb(:,:),transf3(:,:,:),transf5(:,:,:)
 162  real(dp), allocatable :: tsubham(:,:)
 163  type(pawcprj_type) :: cprj_dum(gs_hamk%natom,0)
 164  character(len=500) :: message
 165  character, dimension(2) :: cparam
 166  type(c_ptr) :: A_gpu,C_gpu,coordx2_gpu,coordx3_gpu,bblockvector_gpu,gram_gpu
 167  type(c_ptr) :: blockvectorr_gpu,blockvectorar_gpu,blockvectorbr_gpu
 168 
 169 !Index of a given band
 170 !gramindex(iblocksize)=(iblocksize-1)*cplx+1
 171 
 172 ! *********************************************************************
 173 
 174  DBG_ENTER("COLL")
 175 
 176  call timab(530,1,tsec)
 177  if(abs(dtset%timopt)==4) then
 178    call timab(520,1,tsec)
 179  end if
 180 
 181 !###########################################################################
 182 !################ INITIALISATION  ##########################################
 183 !###########################################################################
 184 
 185 !For timing
 186  timopt=dtset%timopt
 187  tim_wfcopy=584
 188  !tim_xcopy=584
 189  tim_xeigen=587
 190  !tim_xgemm=532
 191  tim_xortho=535
 192  tim_xprecon=536
 193  !tim_xtrsm=535
 194 
 195 !Variables
 196  maxiterations=dtset%nline
 197  gen_eigenpb=(gs_hamk%usepaw==1)
 198  my_nspinor=max(1,dtset%nspinor/mpi_enreg%nproc_spinor)
 199  cminusone=-cone
 200  istwf_k=gs_hamk%istwf_k
 201  info = 0
 202  cparam(1)='t'
 203  cparam(2)='c'
 204 
 205 !Depends on istwfk
 206  if ( istwf_k == 2 ) then
 207    cplx=1
 208    if (mpi_enreg%me_g0 == 1) then
 209      vectsize=2*npw_k*my_nspinor-1
 210    else
 211      vectsize=2*npw_k*my_nspinor
 212    end if
 213  else
 214    cplx=2
 215    vectsize=npw_k*my_nspinor
 216  end if
 217 
 218 !For preconditionning
 219  optekin=0;if (dtset%wfoptalg>10) optekin=0
 220  optpcon=1;if (dtset%wfoptalg>10) optpcon=0
 221 
 222 !For communication
 223  blocksize=mpi_enreg%nproc_fft
 224  if(mpi_enreg%paral_kgb==1) blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp
 225  !IF you want to compare with new lobpcg in sequential uncomment the following
 226  !line
 227  !blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp
 228 
 229 !Iniitializations/allocations of GPU parallelism
 230  use_linalg_gpu=0;use_lapack_gpu=0
 231  if ((dtset%use_gpu_cuda==1).and. &
 232 & (vectsize*blocksize*blocksize>dtset%gpu_linalg_limit)) use_linalg_gpu=1
 233 #if defined HAVE_LINALG_MAGMA
 234  use_lapack_gpu=use_linalg_gpu
 235 #endif
 236  if(use_linalg_gpu==1) then
 237 !  call gpu_linalg_init()
 238    call alloc_on_gpu(A_gpu,cplx*dp*vectsize*blocksize)
 239    call alloc_on_gpu(C_gpu,cplx*dp*vectsize*blocksize)
 240    call alloc_on_gpu(blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 241    call alloc_on_gpu(blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 242    call alloc_on_gpu(blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 243    call alloc_on_gpu(coordx2_gpu,cplx*dp*blocksize*blocksize)
 244    call alloc_on_gpu(coordx3_gpu,cplx*dp*blocksize*blocksize)
 245  end if
 246 
 247  if(abs(dtset%timopt)==4) then
 248    call timab(520,2,tsec)
 249  end if
 250 
 251 !###########################################################################
 252 !################ BIG LOOP OVER BLOCKS  ####################################
 253 !###########################################################################
 254 
 255  do iblock=1,nbdblock
 256 
 257    if(abs(dtset%timopt)==4) then
 258      call timab(521,1,tsec)
 259    end if
 260 
 261    havetoprecon=.true.
 262    nrestart=0
 263    bblocksize=(iblock-1)*blocksize
 264 
 265 !  allocations
 266    ABI_ALLOCATE(pcon,(npw_k,blocksize))
 267    ABI_ALLOCATE(blockvectorx,(cplx*vectsize,blocksize))
 268    ABI_ALLOCATE(blockvectorax,(cplx*vectsize,blocksize))
 269    ABI_ALLOCATE(blockvectorbx,(cplx*vectsize,blocksize))
 270    ABI_ALLOCATE(blockvectorr,(cplx*vectsize,blocksize))
 271    ABI_ALLOCATE(blockvectorar,(cplx*vectsize,blocksize))
 272    ABI_ALLOCATE(blockvectorbr,(cplx*vectsize,blocksize))
 273    ABI_ALLOCATE(blockvectorp,(cplx*vectsize,blocksize))
 274    ABI_ALLOCATE(blockvectorap,(cplx*vectsize,blocksize))
 275    ABI_ALLOCATE(blockvectorbp,(cplx*vectsize,blocksize))
 276    ABI_ALLOCATE(blockvectordumm,(cplx*vectsize,blocksize))
 277    ABI_ALLOCATE(gramxax,(cplx*blocksize,blocksize))
 278    ABI_ALLOCATE(gramxar,(cplx*blocksize,blocksize))
 279    ABI_ALLOCATE(gramxap,(cplx*blocksize,blocksize))
 280    ABI_ALLOCATE(gramrar,(cplx*blocksize,blocksize))
 281    ABI_ALLOCATE(gramrap,(cplx*blocksize,blocksize))
 282    ABI_ALLOCATE(grampap,(cplx*blocksize,blocksize))
 283    ABI_ALLOCATE(gramxbx,(cplx*blocksize,blocksize))
 284    ABI_ALLOCATE(gramxbr,(cplx*blocksize,blocksize))
 285    ABI_ALLOCATE(gramxbp,(cplx*blocksize,blocksize))
 286    ABI_ALLOCATE(gramrbr,(cplx*blocksize,blocksize))
 287    ABI_ALLOCATE(gramrbp,(cplx*blocksize,blocksize))
 288    ABI_ALLOCATE(grampbp,(cplx*blocksize,blocksize))
 289    ABI_ALLOCATE(transf3,(cplx*blocksize,blocksize,3))
 290    ABI_ALLOCATE(transf5,(cplx*blocksize,blocksize,5))
 291    ABI_ALLOCATE(lambda,(cplx*blocksize,blocksize))
 292    ABI_ALLOCATE(residualnorms,(blocksize))
 293 
 294    ABI_ALLOCATE(blockvectory,(cplx*vectsize,bblocksize))
 295    ABI_ALLOCATE(blockvectorby,(cplx*vectsize,bblocksize))
 296    ABI_ALLOCATE(gramyx,(cplx*bblocksize,blocksize))
 297    if (gs_hamk%usepaw==0) then
 298      ABI_ALLOCATE(blockvectorvx,(cplx*vectsize,blocksize))
 299      ABI_ALLOCATE(blockvectorvr,(cplx*vectsize,blocksize))
 300      ABI_ALLOCATE(blockvectorvp,(cplx*vectsize,blocksize))
 301    end if
 302 
 303    if(use_linalg_gpu==1) then
 304      if(iblock/=1) then
 305        call alloc_on_gpu(bblockvector_gpu,cplx*dp*vectsize*bblocksize)
 306        call alloc_on_gpu(gram_gpu,cplx*dp*bblocksize*blocksize)
 307      else
 308        call alloc_on_gpu(bblockvector_gpu,cplx*dp*vectsize*blocksize)
 309        call alloc_on_gpu(gram_gpu,cplx*dp*blocksize*blocksize)
 310      end if
 311    end if
 312 
 313 ! Initialize global variables in m_wfutils.
 314    call setWFParameter(cplx,mpi_enreg%me_g0,npw_k,my_nspinor,icg,igsc,blocksize)
 315 
 316 !  transfer array of wf coeff in iblock to blockvectorx
 317    call wfcopy('D',blocksize*vectsize,cg,1,blockvectorx,1,blocksize,iblock,'C',withbbloc=.true.,&
 318 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 319 
 320 !  !!!!!!!!!!!!!!!!!!!!!!!! Begin if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!
 321 !  transfer array of wf coeff less than iblock to blockvectory
 322    if(iblock /=1) then
 323      call wfcopy('D',bblocksize*vectsize,cg,1,blockvectory,1,bblocksize,iblock,'C',withbbloc=.false.,&
 324 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 325 
 326      if(gen_eigenpb) then
 327        call wfcopy('D',bblocksize*vectsize,gsc,1,blockvectorby,1,bblocksize,iblock,'S',withbbloc=.false.,&
 328 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 329      else
 330        call abi_xcopy(vectsize*bblocksize,blockvectory,1,blockvectorby,1,x_cplx=x_cplx)
 331      end if
 332 
 333 !    b-orthogonalize x to the constraint y (supposed b-orthonormal)
 334 !    blockvectorx=blockvectorx-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorx))
 335 
 336      call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,&
 337 &     vectsize,blockvectorx,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx)
 338 
 339      if(abs(dtset%timopt)==3) then
 340        call timab(533,1,tsec)
 341      end if
 342      call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr)
 343      if(abs(dtset%timopt)==3) then
 344        call timab(533,2,tsec)
 345      end if
 346 
 347      call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,&
 348 &     vectsize,gramyx,bblocksize,cone,blockvectorx,vectsize,x_cplx=x_cplx)
 349 
 350    end if
 351 !  !!!!!!!!!!!!!!!!!!!!!!!! End if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!!
 352 
 353    ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize))
 354    ABI_ALLOCATE(gwavef,(2,npw_k*my_nspinor*blocksize))
 355    ABI_ALLOCATE(gvnlc,(2,npw_k*my_nspinor*blocksize))
 356    ABI_ALLOCATE(swavef,(2,npw_k*my_nspinor*blocksize))
 357 
 358    call wfcopy('I',vectsize*blocksize,blockvectorx,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,&
 359 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 360 
 361    if(abs(dtset%timopt)==4) then
 362      call timab(521,2,tsec)
 363    end if
 364    if(abs(dtset%timopt)==4) then
 365      call timab(526,1,tsec)
 366    end if
 367 
 368    cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1
 369 
 370    if (mpi_enreg%paral_kgb==0) then
 371      call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlc,dum,&
 372 &     mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0)
 373    else
 374      call prep_getghc(cwavef,gs_hamk,gvnlc,gwavef,swavef,dum,blocksize,mpi_enreg,&
 375 &     prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.)
 376    end if
 377    if(abs(dtset%timopt)==4) then
 378      call timab(526,2,tsec)
 379    end if
 380    if(abs(dtset%timopt)==4) then
 381      call timab(522,1,tsec)
 382    end if
 383 
 384    if ( gen_eigenpb ) then
 385      call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbx,1,blocksize,iblock,'W',withbbloc=.false.,&
 386 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 387    else
 388      call wfcopy('D',vectsize*blocksize,gvnlc,1,blockvectorvx,1,blocksize,iblock,'W',withbbloc=.false.,&
 389 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 390      call abi_xcopy(vectsize*blocksize,blockvectorx,1,blockvectorbx,1,x_cplx=x_cplx)
 391    end if
 392 
 393    call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorax,1,blocksize,iblock,'W',withbbloc=.false.,&
 394 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
 395 
 396    ABI_DEALLOCATE(cwavef)
 397    ABI_DEALLOCATE(gwavef)
 398    ABI_DEALLOCATE(gvnlc)
 399    ABI_DEALLOCATE(swavef)
 400 
 401    call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,mpi_enreg%comm_bandspinorfft,gramxbx,vectsize,&
 402 &   x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 403 
 404    call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorbx,vectsize,x_cplx=x_cplx)
 405    call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorax,vectsize,x_cplx=x_cplx)
 406 
 407    if (gs_hamk%usepaw==0) then
 408      call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorvx,vectsize,x_cplx=x_cplx)
 409    end if
 410 
 411 !  Do rayleigh ritz on a in space x
 412 !  gramxax=matmul(transpose(blockvectorx),blockvectorax)
 413    call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorx,&
 414 &   vectsize,blockvectorax,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 415 
 416    if(abs(dtset%timopt)==3) then
 417      call timab(533,1,tsec)
 418    end if
 419    call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr)
 420    if(abs(dtset%timopt)==3) then
 421      call timab(533,2,tsec)
 422    end if
 423    ABI_ALLOCATE(eigen,(blocksize))
 424 
 425    call abi_xheev('v','u',blocksize,gramxax,eigen,x_cplx=cplx,istwf_k=istwf_k, &
 426    timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu)
 427 
 428 !  blockvectorx=matmul(blockvectorx,gramxax)
 429    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,&
 430 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 431    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorx,1,x_cplx=x_cplx)
 432 
 433 !  blockvectorax=matmul(blockvectorax,gramxax)
 434    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,&
 435 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 436    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorax,1,x_cplx=x_cplx)
 437 
 438 !  blockvectorvx=matmul(blockvectorvx,gramxax)
 439    if (gs_hamk%usepaw==0) then
 440      call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,&
 441 &     vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 442      call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvx,1,x_cplx=x_cplx)
 443    end if
 444 
 445 !  blockvectorbx=matmul(blockvectorbx,gramxax)
 446    call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,&
 447 &   vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 448    call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbx,1,x_cplx=x_cplx)
 449 
 450    do iblocksize=1,blocksize
 451      zvar=(/eigen(iblocksize),zero/)
 452      call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 453    end do
 454    ABI_DEALLOCATE(eigen)
 455 
 456    if(abs(dtset%timopt)==4) then
 457      call timab(522,2,tsec)
 458    end if
 459 
 460 !  ###########################################################################
 461 !  ################ PERFORM LOOP ON NLINE ####################################
 462 !  ###########################################################################
 463 !  now the main alogrithm
 464    iter: do iterationnumber=1,maxiterations
 465 
 466      if(abs(dtset%timopt)==4) then
 467        call timab(523,1,tsec)
 468      end if
 469 
 470 !    Build residual
 471 !    blockvectorr=blockvectorax-matmul(blockvectorx,lambda)
 472      call xprecon(blockvectorbx,lambda,blocksize,&
 473 &     iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,&
 474 &     optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon)
 475 
 476      residualnorms=sum(blockvectorr**2,dim=1)
 477 
 478      if(abs(dtset%timopt)==3) then
 479        call timab(533,1,tsec)
 480      end if
 481      call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr)
 482      if(abs(dtset%timopt)==3) then
 483        call timab(533,2,tsec)
 484      end if
 485 
 486      resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
 487 
 488 !    If residual sufficiently small stop line minimizations
 489      if (abs(maxval(residualnorms(1:blocksize)))<dtset%tolwfr) then
 490        if (prtvol > 0) then
 491          write(message, '(a,i0,a,i0,a,es12.4)' ) &
 492 &         ' lobpcgwf: block ',iblock,' converged after ',iterationnumber,&
 493 &         ' line minimizations: maxval(resid(1:blocksize)) =',maxval(residualnorms(1:blocksize))
 494          call wrtout(std_out,message,'PERS')
 495        end if
 496        havetoprecon=.false.
 497        exit
 498      end if
 499 
 500      if(use_linalg_gpu==1) then
 501        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 502      end if
 503 
 504      if(iblock /=1) then
 505 !      Residuals orthogonal to blockvectorby
 506 !      blockvectorr=blockvectorr-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorr))
 507 
 508        if(use_linalg_gpu==1) then
 509          call copy_on_gpu(blockvectorby,bblockvector_gpu,cplx*dp*vectsize*bblocksize)
 510          call gpu_xgemm(cplx,cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,bblockvector_gpu,&
 511 &         vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,bblocksize)
 512          call copy_from_gpu(gramyx,gram_gpu,cplx*dp*bblocksize*blocksize)
 513        else
 514          call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,&
 515 &         vectsize,blockvectorr,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx)
 516        end if
 517 
 518        if(abs(dtset%timopt)==3) then
 519          call timab(533,1,tsec)
 520        end if
 521        call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr)
 522        if(abs(dtset%timopt)==3) then
 523          call timab(533,2,tsec)
 524        end if
 525 
 526        if(use_linalg_gpu==1) then
 527          call copy_on_gpu(gramyx,gram_gpu,cplx*dp*bblocksize*blocksize)
 528          call copy_on_gpu(blockvectory,bblockvector_gpu,cplx*dp*vectsize*bblocksize)
 529          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,bblocksize,cminusone,bblockvector_gpu,&
 530 &         vectsize,gram_gpu,bblocksize,cone,blockvectorr_gpu,vectsize)
 531        else
 532          call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,&
 533 &         vectsize,gramyx,bblocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx)
 534        end if
 535 
 536      end if
 537 
 538 !    Residuals orthogonal to blockvectorx
 539 !    blockvectorr=blockvectorr-matmul(blockvectorx,matmul((blockvectorbx)^T,blockvectorr))
 540      if(use_linalg_gpu==1) then
 541        call copy_on_gpu(blockvectorbx,C_gpu,cplx*dp*vectsize*blocksize)
 542        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,C_gpu,&
 543 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 544        call copy_from_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize)
 545      else
 546        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,&
 547 &       vectsize,blockvectorr,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 548      end if
 549 
 550      if(abs(dtset%timopt)==3) then
 551        call timab(533,1,tsec)
 552      end if
 553      call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr)
 554      if(abs(dtset%timopt)==3) then
 555        call timab(533,2,tsec)
 556      end if
 557 
 558      if(use_linalg_gpu==1) then
 559        call copy_on_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize)
 560        call copy_on_gpu(blockvectorx,C_gpu,cplx*dp*vectsize*blocksize)
 561        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cminusone,C_gpu,&
 562 &       vectsize,gram_gpu,blocksize,cone,blockvectorr_gpu,vectsize)
 563        call copy_from_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 564      else
 565        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cminusone,blockvectorx,&
 566 &       vectsize,gramxax,blocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx)
 567      end if
 568 
 569      ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize))
 570      ABI_ALLOCATE(gwavef,(2,npw_k*my_nspinor*blocksize))
 571      ABI_ALLOCATE(gvnlc,(2,npw_k*my_nspinor*blocksize))
 572      ABI_ALLOCATE(swavef,(2,npw_k*my_nspinor*blocksize))
 573 
 574      call wfcopy('I',vectsize*blocksize,blockvectorr,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,&
 575 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 576 
 577      cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1
 578 
 579      if(abs(dtset%timopt)==4) then
 580        call timab(523,2,tsec)
 581      end if
 582      if(abs(dtset%timopt)==4) then
 583        call timab(526,1,tsec)
 584      end if
 585 
 586      if (mpi_enreg%paral_kgb==0) then
 587        call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlc,dum,&
 588 &       mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0)
 589      else
 590        call prep_getghc(cwavef,gs_hamk,gvnlc,gwavef,swavef,dum,blocksize,mpi_enreg,&
 591 &       prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.)
 592      end if
 593 
 594      if(abs(dtset%timopt)==4) then
 595        call timab(526,2,tsec)
 596      end if
 597      if(abs(dtset%timopt)==4) then
 598        call timab(524,1,tsec)
 599      end if
 600 
 601      if (gen_eigenpb) then
 602        call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbr,1,blocksize,iblock,'W',withbbloc=.false.,&
 603 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 604      else
 605        call abi_xcopy(vectsize*blocksize,blockvectorr,1,blockvectorbr,1,x_cplx=x_cplx)
 606        call wfcopy('D',vectsize*blocksize,gvnlc,1,blockvectorvr,1,blocksize,iblock,'W',withbbloc=.false.,&
 607 &       timopt=timopt,tim_wfcopy=tim_wfcopy)
 608      end if
 609 
 610      call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorar,1,blocksize,iblock,'W',withbbloc=.false.,&
 611 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
 612 
 613      ABI_DEALLOCATE(cwavef)
 614      ABI_DEALLOCATE(gwavef)
 615      ABI_DEALLOCATE(gvnlc)
 616      ABI_DEALLOCATE(swavef)
 617 
 618      if(use_linalg_gpu==1) then
 619        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 620        call gpu_xorthonormalize(blockvectorr_gpu,blockvectorbr_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,&
 621 &       x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 622        call copy_from_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 623        call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorbr_gpu,vectsize)
 624        call copy_from_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 625        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 626        call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorar_gpu,vectsize)
 627        call copy_from_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 628        if (gs_hamk%usepaw==0) then
 629          call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 630          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 631          call copy_from_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 632        end if
 633        call copy_from_gpu(gramrbr,gram_gpu,cplx*dp*blocksize*blocksize)
 634      else
 635        call abi_xorthonormalize(blockvectorr,blockvectorbr,blocksize,mpi_enreg%comm_bandspinorfft,gramrbr,vectsize,&
 636 &       x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 637        call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorbr,vectsize,x_cplx=x_cplx)
 638        call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorar,vectsize,x_cplx=x_cplx)
 639        if (gs_hamk%usepaw==0) then
 640          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorvr,vectsize,x_cplx=x_cplx)
 641        end if
 642      end if
 643 
 644      if(iterationnumber>1) then
 645        if(use_linalg_gpu==1) then
 646          call copy_on_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize)
 647          call copy_on_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
 648          call gpu_xorthonormalize(A_gpu,C_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,&
 649 &         x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 650          call copy_from_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize)
 651          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,C_gpu,vectsize)
 652          call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
 653          call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 654          call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 655          call copy_from_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 656          if (gs_hamk%usepaw==0) then
 657            call copy_on_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize)
 658            call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize)
 659            call copy_from_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize)
 660          end if
 661          call copy_from_gpu(grampbp,gram_gpu,cplx*dp*blocksize*blocksize)
 662        else
 663 !        call orthonormalize(blockvectorp,blockvectorbp,blockvectorap)
 664          call abi_xorthonormalize(blockvectorp,blockvectorbp,blocksize,mpi_enreg%comm_bandspinorfft,grampbp,vectsize,&
 665 &         x_cplx,timopt=timopt,tim_xortho=tim_xortho)
 666 !        blockvectorap=matmul(blockvectorap,grampbp)
 667          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorbp,vectsize,x_cplx=x_cplx)
 668          call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorap,vectsize,x_cplx=x_cplx)
 669          if (gs_hamk%usepaw==0) then
 670            call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorvp,vectsize,x_cplx=x_cplx)
 671          end if
 672        end if
 673      end if
 674 
 675      activersize=blocksize
 676      if (iterationnumber==1) then
 677        activepsize=0
 678        restart=1
 679      else
 680        activepsize=blocksize
 681        restart=0
 682      end if
 683 
 684 !    gramxar=matmul((blockvectorax)^T,blockvectorr)
 685 !    gramrar=matmul((blockvectorar)^T,blockvectorr)
 686 !    gramxax=matmul((blockvectorax)^T,blockvectorx)
 687      if(use_linalg_gpu==1) then
 688        call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize)
 689        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,&
 690 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 691        call copy_from_gpu(gramxar,gram_gpu,cplx*dp*blocksize*blocksize)
 692        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar_gpu,&
 693 &       vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize)
 694        call copy_from_gpu(gramrar,gram_gpu,cplx*dp*blocksize*blocksize)
 695        call copy_on_gpu(blockvectorx,C_gpu,cplx*dp*vectsize*blocksize)
 696        call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,&
 697 &       vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 698        call copy_from_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize)
 699      else
 700        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 701 &       vectsize,blockvectorr,vectsize,czero,gramxar,blocksize,x_cplx=x_cplx)
 702        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,&
 703 &       vectsize,blockvectorr,vectsize,czero,gramrar,blocksize,x_cplx=x_cplx)
 704        call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 705 &       vectsize,blockvectorx,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx)
 706      end if
 707 
 708      call abi_xcopy(blocksize*blocksize,gramxar,1,transf3(:,:,1),1,x_cplx=x_cplx)
 709      call abi_xcopy(blocksize*blocksize,gramrar,1,transf3(:,:,2),1,x_cplx=x_cplx)
 710      call abi_xcopy(blocksize*blocksize,gramxax,1,transf3(:,:,3),1,x_cplx=x_cplx)
 711      if(abs(dtset%timopt)==3) then
 712        call timab(533,1,tsec)
 713      end if
 714      call xmpi_sum(transf3,mpi_enreg%comm_bandspinorfft,ierr)
 715      if(abs(dtset%timopt)==3) then
 716        call timab(533,2,tsec)
 717      end if
 718 
 719      call abi_xcopy(blocksize*blocksize,transf3(:,:,1),1,gramxar,1,x_cplx=x_cplx)
 720      call abi_xcopy(blocksize*blocksize,transf3(:,:,2),1,gramrar,1,x_cplx=x_cplx)
 721      call abi_xcopy(blocksize*blocksize,transf3(:,:,3),1,gramxax,1,x_cplx=x_cplx)
 722 
 723 !    gramxbx=matmul((blockvectorbx)^T,blockvectorx)
 724 !    gramrbr=matmul((blockvectorbr)^T,blockvectorr)
 725 !    gramxbr=matmul((blockvectorbx)^T,blockvectorr)
 726 !    Note that the gramb matrix is more easier to construct than grama:
 727 !    i) <x|B|x>=<r|B|r>=<p|B|p>=(1;0)
 728 !    since the x, r and p blockvector are normalized
 729 !    ii) <r|B|x>=(0;0)
 730 !    since the x and r blockvector are orthogonalized
 731 !    iii) The <p|B|r> and <p|B|x> have to be computed.
 732      gramxbx(:,:)=zero
 733      gramrbr(:,:)=zero
 734      gramxbr(:,:)=zero
 735      do iblocksize=1,blocksize
 736        zvar=(/one,zero/)
 737        call abi_xcopy(1,zvar,1,gramxbx(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 738        call abi_xcopy(1,zvar,1,gramrbr(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 739      end do
 740 
 741 !    ###########################################################################
 742 !    ################ PERFORM LOOP ON COND #####################################
 743 !    ###########################################################################
 744 
 745      i1=0;i2=blocksize;i3=2*blocksize;i4=3*blocksize
 746      cond: do cond_try=1,2 !2 when restart
 747        if (restart==0) then
 748 
 749 !        gramxap=matmul((blockvectorax)^T,blockvectorp)
 750 !        gramrap=matmul((blockvectorar)^T,blockvectorp)
 751 !        grampap=matmul((blockvectorap)^T,blockvectorp)
 752 !        gramxbp=matmul((blockvectorbx)^T,blockvectorp)
 753 !        gramrbp=matmul((blockvectorbr)^T,blockvectorp)
 754 !        grampbp=matmul((blockvectorbp)^T,blockvectorp)
 755          if(use_linalg_gpu==1) then
 756            call copy_on_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize)
 757            call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize)
 758            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 759 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 760            call copy_from_gpu(gramxap,gram_gpu,cplx*dp*blocksize*blocksize)
 761            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 762 &           cone,blockvectorar_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 763            call copy_from_gpu(gramrap,gram_gpu,cplx*dp*blocksize*blocksize)
 764            call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 765            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 766 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 767            call copy_from_gpu(grampap,gram_gpu,cplx*dp*blocksize*blocksize)
 768            call copy_on_gpu(blockvectorbx,A_gpu,cplx*dp*vectsize*blocksize)
 769            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 770 &           cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 771            call copy_from_gpu(gramxbp,gram_gpu,cplx*dp*blocksize*blocksize)
 772            call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,&
 773 &           cone,blockvectorbr_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize)
 774            call copy_from_gpu(gramrbp,gram_gpu,cplx*dp*blocksize*blocksize)
 775          else
 776            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,&
 777 &           vectsize,blockvectorp,vectsize,czero,gramxap,blocksize,x_cplx=x_cplx)
 778            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,&
 779 &           vectsize,blockvectorp,vectsize,czero,gramrap,blocksize,x_cplx=x_cplx)
 780            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorap,&
 781 &           vectsize,blockvectorp,vectsize,czero,grampap,blocksize,x_cplx=x_cplx)
 782            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,&
 783 &           vectsize,blockvectorp,vectsize,czero,gramxbp,blocksize,x_cplx=x_cplx)
 784            call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbr,&
 785 &           vectsize,blockvectorp,vectsize,czero,gramrbp,blocksize,x_cplx=x_cplx)
 786          end if
 787 !        It's not necessary to compute the last one: <p|B|p>=(1;0) (see above)
 788          transf5(:,:,1)=gramxap(:,:)
 789          transf5(:,:,2)=gramrap(:,:)
 790          transf5(:,:,3)=grampap(:,:)
 791          transf5(:,:,4)=gramxbp(:,:)
 792          transf5(:,:,5)=gramrbp(:,:)
 793          if(abs(dtset%timopt)==3) then
 794            call timab(533,1,tsec)
 795          end if
 796          call xmpi_sum(transf5,mpi_enreg%comm_bandspinorfft,ierr)
 797          if(abs(dtset%timopt)==3) then
 798            call timab(533,2,tsec)
 799          end if
 800          gramxap(:,:)=transf5(:,:,1)
 801          gramrap(:,:)=transf5(:,:,2)
 802          grampap(:,:)=transf5(:,:,3)
 803          gramxbp(:,:)=transf5(:,:,4)
 804          gramrbp(:,:)=transf5(:,:,5)
 805          grampbp(:,:)=zero
 806          do iblocksize=1,blocksize
 807            zvar=(/one,zero/)
 808            call abi_xcopy(1,zvar,1,grampbp(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx)
 809          end do
 810          bigorder=i4
 811          ABI_ALLOCATE(grama,(cplx*i4,i4))
 812          ABI_ALLOCATE(gramb,(cplx*i4,i4))
 813          ABI_ALLOCATE(eigen,(i4))
 814 !        ABI_ALLOCATE(coordx,(cplx*i4,blocksize))
 815          ABI_ALLOCATE(coordx1,(cplx*blocksize,blocksize))
 816          ABI_ALLOCATE(coordx2,(cplx*blocksize,blocksize))
 817          ABI_ALLOCATE(coordx3,(cplx*blocksize,blocksize))
 818          grama(:,:)=zero;gramb(:,:)=zero
 819          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax
 820          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar
 821          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxap
 822          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar
 823          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrap
 824          grama(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampap
 825          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx
 826          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr
 827          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxbp
 828          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr
 829          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrbp
 830          gramb(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampbp
 831        else
 832          bigorder=i3
 833          ABI_ALLOCATE(grama,(cplx*i3,i3))
 834          ABI_ALLOCATE(gramb,(cplx*i3,i3))
 835          ABI_ALLOCATE(eigen,(i3))
 836 !        ABI_ALLOCATE(coordx,(cplx*i3,blocksize))
 837          ABI_ALLOCATE(coordx1,(cplx*blocksize,blocksize))
 838          ABI_ALLOCATE(coordx2,(cplx*blocksize,blocksize))
 839          grama(:,:)=zero;gramb(:,:)=zero
 840          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax
 841          grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar
 842          grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar
 843          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx
 844          gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr
 845          gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr
 846        end if
 847 
 848        ABI_ALLOCATE(tmpgramb,(cplx*bigorder,bigorder))
 849        ABI_ALLOCATE(tmpeigen,(bigorder))
 850        tmpgramb=gramb
 851 
 852        call abi_xheev('v','u',bigorder,tmpgramb,tmpeigen,x_cplx=cplx,istwf_k=istwf_k, &
 853 &       timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu)
 854 
 855        condestgramb=tmpeigen(bigorder)/tmpeigen(1)
 856        ABI_DEALLOCATE(tmpgramb)
 857        ABI_DEALLOCATE(tmpeigen)
 858 
 859        if (condestgramb.gt.1d+5.or.condestgramb.lt.0.d0.or.info/=0) then
 860          write(std_out,*)'condition number of the Gram matrix = ',condestgramb
 861          if (cond_try==1.and.restart==0) then
 862            ABI_DEALLOCATE(grama)
 863            ABI_DEALLOCATE(gramb)
 864            ABI_DEALLOCATE(eigen)
 865 !          ABI_DEALLOCATE(coordx)
 866            ABI_DEALLOCATE(coordx1)
 867            ABI_DEALLOCATE(coordx2)
 868            if(bigorder==i4) then
 869              ABI_DEALLOCATE(coordx3)
 870            end if
 871            if (nrestart.gt.1) then
 872              MSG_WARNING('the minimization is stopped for this block')
 873              exit iter
 874            else
 875              restart=1
 876              nrestart=nrestart+1
 877              call wrtout(std_out,'Lobpcgwf: restart performed',"PERS")
 878            end if
 879          else
 880            MSG_WARNING('Gramm matrix ill-conditionned: results may be unpredictable')
 881          end if
 882        else
 883          exit cond
 884        end if
 885      end do cond
 886 
 887 !    ###########################################################################
 888 !    ################ END LOOP ON COND #########################################
 889 !    ###########################################################################
 890 
 891      call abi_xhegv(1,'v','u',bigorder,grama,gramb,eigen,x_cplx=cplx,istwf_k=istwf_k, &
 892      timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu)
 893 
 894      deltae=-one
 895      do iblocksize=1,blocksize
 896        call abi_xcopy(1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,zvar,1,x_cplx=x_cplx)
 897        deltae=max(deltae,abs(cmplx(zvar(1),zvar(2))-eigen(iblocksize)))
 898        zvar=(/eigen(iblocksize),zero/)
 899        call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,x_cplx=x_cplx)
 900      end do
 901 
 902 !    DEBUG
 903 !    write(std_out,*)'eigen',eigen(1:blocksize)
 904 !    ENDDEBUG
 905 
 906 !    coordx(1:bigorder*cplx,1:blocksize)=grama(1:bigorder*cplx,1:blocksize)
 907      coordx1(:,:) =  grama(1+i1*cplx : i2*cplx,1:blocksize)
 908      coordx2(:,:) =  grama(1+i2*cplx : i3*cplx,1:blocksize)
 909      if(bigorder==i4) then
 910        coordx3(:,:) =  grama(1+i3*cplx : i4*cplx,1:blocksize)
 911      end if
 912 
 913 
 914      if(use_linalg_gpu==1) then
 915        call copy_on_gpu(coordx2,coordx2_gpu,cplx*dp*blocksize*blocksize)
 916        if(bigorder==i4) then
 917          call copy_on_gpu(coordx3,coordx3_gpu,cplx*dp*blocksize*blocksize)
 918        end if
 919      end if
 920 
 921      ABI_DEALLOCATE(grama)
 922      ABI_DEALLOCATE(gramb)
 923      ABI_DEALLOCATE(eigen)
 924      if (restart==0 .and. iterationnumber >1) then
 925 
 926 !      blockvectorp=matmul(blockvectorr,coordx(i2+1:i3,:))+&
 927 !      &               matmul(blockvectorp,coordx(i3+1:i4,:))
 928        if(use_linalg_gpu==1) then
 929 !        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
 930          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,&
 931 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 932          call copy_on_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize)
 933          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 934 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 935          call copy_from_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize)
 936        else
 937          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,&
 938 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 939          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorp,&
 940 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 941          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorp,1,x_cplx=x_cplx)
 942        end if
 943 
 944 !      blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))+&
 945 !      &                matmul(blockvectorap,coordx(i3+1:i4,:))
 946        if(use_linalg_gpu==1) then
 947 !        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
 948          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,&
 949 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 950          call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize)
 951          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 952 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 953          call copy_from_gpu(blockvectorap,C_gpu,cplx*dp*vectsize*blocksize)
 954        else
 955          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,&
 956 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 957          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorap,&
 958 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 959          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorap,1,x_cplx=x_cplx)
 960        end if
 961 
 962 
 963 !      blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))+&
 964 !      &                matmul(blockvectorvp,coordx(i3+1:i4,:))
 965        if (gs_hamk%usepaw==0) then
 966          if(use_linalg_gpu==1) then
 967            call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
 968            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 969 &           vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 970            call copy_on_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize)
 971            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
 972 &           vectsize,coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 973            call copy_from_gpu(blockvectorvp,C_gpu,cplx*dp*vectsize*blocksize)
 974          else
 975            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,&
 976 &           vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 977            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvp,&
 978 &           vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 979            call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvp,1,x_cplx=x_cplx)
 980          end if
 981        end if
 982 
 983 !      blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))+&
 984 !      &                matmul(blockvectorbp,coordx(i3+1:i4,:))
 985        if(use_linalg_gpu==1) then
 986 !        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
 987          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,&
 988 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
 989          call copy_on_gpu(blockvectorbp,A_gpu,cplx*dp*vectsize*blocksize)
 990          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,&
 991 &         coordx3_gpu,blocksize,cone,C_gpu,vectsize)
 992          call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
 993        else
 994          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,&
 995 &         vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
 996          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbp,&
 997 &         vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx)
 998          call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbp,1,x_cplx=x_cplx)
 999        end if
1000 
1001      else
1002 
1003 !      blockvectoSz =matmul(blockvectorr,coordx(i2+1:i3,:))
1004        if(use_linalg_gpu==1) then
1005 !        call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize)
1006          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,&
1007 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1008          call copy_from_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize)
1009        else
1010          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,&
1011 &         vectsize,coordx2,blocksize,czero,blockvectorp,vectsize,x_cplx=x_cplx)
1012        end if
1013 
1014 !      blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))
1015        if(use_linalg_gpu==1) then
1016 !        call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize)
1017          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,&
1018 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1019          call copy_from_gpu(blockvectorap,C_gpu,cplx*dp*vectsize*blocksize)
1020        else
1021          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,&
1022 &         vectsize,coordx2,blocksize,czero,blockvectorap,vectsize,x_cplx=x_cplx)
1023        end if
1024 !      blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))
1025        if (gs_hamk%usepaw==0) then
1026          if(use_linalg_gpu==1) then
1027            call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize)
1028            call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1029 &           vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1030            call copy_from_gpu(blockvectorvp,C_gpu,cplx*dp*vectsize*blocksize)
1031          else
1032            call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,&
1033 &           vectsize,coordx2,blocksize,czero,blockvectorvp,vectsize,x_cplx=x_cplx)
1034          end if
1035        end if
1036 
1037 !      blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))
1038        if(use_linalg_gpu==1) then
1039 !        call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize)
1040          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,&
1041 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1042          call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize)
1043        else
1044          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,&
1045 &         vectsize,coordx2,blocksize,czero,blockvectorbp,vectsize,x_cplx=x_cplx)
1046        end if
1047      end if
1048 
1049      if(use_linalg_gpu==1) then
1050        call copy_on_gpu(coordx1,coordx2_gpu,cplx*dp*blocksize*blocksize)
1051      end if
1052 
1053 !    blockvectorx = matmul(blockvectorx,coordx(i1+1:i2,:))+blockvectorp
1054      if(use_linalg_gpu==1) then
1055        call copy_on_gpu(blockvectorx,A_gpu,cplx*dp*vectsize*blocksize)
1056        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1057 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1058        call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1059      else
1060        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,&
1061 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1062      end if
1063      blockvectorx = blockvectordumm+blockvectorp
1064 
1065 !    blockvectorax= matmul(blockvectorax,coordx(i1+1:i2,:))+blockvectorap
1066      if(use_linalg_gpu==1) then
1067        call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize)
1068        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1069 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1070        call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1071      else
1072        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,&
1073 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1074      end if
1075      blockvectorax = blockvectordumm+blockvectorap
1076 
1077 !    blockvectorvx= matmul(blockvectorvx,coordx(i1+1:i2,:))+blockvectorvp
1078      if (gs_hamk%usepaw==0) then
1079        if(use_linalg_gpu==1) then
1080          call copy_on_gpu(blockvectorvx,A_gpu,cplx*dp*vectsize*blocksize)
1081          call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1082 &         vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1083          call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1084        else
1085          call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,&
1086 &         vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1087        end if
1088        blockvectorvx = blockvectordumm+blockvectorvp
1089      end if
1090 
1091 !    blockvectorbx= matmul(blockvectorbx,coordx(i1+1:i2,:))+blockvectorbp
1092      if(use_linalg_gpu==1) then
1093        call copy_on_gpu(blockvectorbx,A_gpu,cplx*dp*vectsize*blocksize)
1094        call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,&
1095 &       vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize)
1096        call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize)
1097      else
1098        call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,&
1099 &       vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx)
1100      end if
1101      blockvectorbx = blockvectordumm+blockvectorbp
1102 
1103 !    ABI_DEALLOCATE(coordx)
1104      ABI_DEALLOCATE(coordx1)
1105      ABI_DEALLOCATE(coordx2)
1106      if(bigorder==i4) then
1107        ABI_DEALLOCATE(coordx3)
1108      end if
1109 
1110 !    Check convergence on energy and eventually exit
1111      if (iterationnumber==1) then
1112        deold=deltae
1113      else if (iterationnumber>1) then
1114        if ((abs(deltae)<0.005*abs(deold)).and.(iterationnumber/=maxiterations))then
1115          if(prtvol>=10)then
1116            write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) &
1117 &           ' lobpcgwf: block',iblock,', line',iterationnumber,&
1118 &           ', deltae=',deltae,' < 0.005*',deold,' =>skip lines !'
1119            call wrtout(std_out,message,'PERS')
1120          end if
1121          exit
1122        else if (abs(deltae)>0.005*abs(deold)) then
1123          if(prtvol>=10)then
1124            write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) &
1125 &           ' lobpcgwf: block',iblock,', line',iterationnumber,&
1126 &           ', deltae=',deltae,' > 0.005*',deold,' =>keep on working !'
1127            call wrtout(std_out,message,'PERS')
1128          end if
1129        end if
1130      end if
1131 
1132      if(abs(dtset%timopt)==4) then
1133        call timab(524,2,tsec)
1134      end if
1135 
1136    end do iter
1137 
1138 !  ###########################################################################
1139 !  ################## END LOOP ON NLINE ######################################
1140 !  ###########################################################################
1141 
1142    if(abs(dtset%timopt)==4) then
1143      call timab(525,1,tsec)
1144    end if
1145 
1146    if (havetoprecon) then
1147      call xprecon(blockvectorbx,lambda,blocksize,&
1148 &     iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,&
1149 &     optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon)
1150 
1151      residualnorms=sum(abs(blockvectorr)**2,dim=1)
1152 
1153      if(abs(dtset%timopt)==3) then
1154        call timab(533,1,tsec)
1155      end if
1156      call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr)
1157      if(abs(dtset%timopt)==3) then
1158        call timab(533,2,tsec)
1159      end if
1160 
1161      resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize)
1162    end if
1163 
1164    call wfcopy('I',vectsize*blocksize,blockvectorx,1,cg,1,blocksize,iblock,'C',withbbloc=.true.,&
1165 &   timopt=timopt,tim_wfcopy=tim_wfcopy)
1166 
1167    if(gen_eigenpb) then
1168      call wfcopy('I',vectsize*blocksize,blockvectorbx,1,gsc,1,blocksize,iblock,'S',withbbloc=.true.,&
1169 &     timopt=timopt,tim_wfcopy=tim_wfcopy)
1170    end if
1171 
1172 !  The Vnl part of the Hamiltonian is no more stored in the packed form such as it was the case for subvnl(:).
1173 !  Now, the full matrix is stored in totvnl(:,:). This trick permits:
1174 !  1) to avoid the reconstruction of the total matrix in vtowfk.F90 (double loop over bands)
1175 !  2) to use two optimized matrix-matrix blas routine for general (in lobpcgccwf.F90) or hermitian (in vtowfk.F90)
1176 !  operators, zgemm.f and zhemm.f respectively, rather than a triple loop in both cases.
1177    iwavef=iblock*blocksize
1178    isubh=1+2*bblocksize*(bblocksize+1)/2
1179 
1180    ABI_ALLOCATE(blockvectorz,(cplx*vectsize,iwavef))
1181    if(bblocksize > 0 ) then
1182      call abi_xcopy(bblocksize*vectsize,blockvectory(:,1:bblocksize),1,blockvectorz(:,1:bblocksize),1,x_cplx=x_cplx)
1183    end if
1184    call abi_xcopy( blocksize*vectsize,blockvectorx(:,1:blocksize) ,1,blockvectorz(:,bblocksize+1:iwavef),1,x_cplx=x_cplx)
1185 
1186    ABI_ALLOCATE(tsubham,(cplx*iwavef,blocksize))
1187    tsubham(:,:)=zero
1188    call abi_xgemm(cparam(cplx),'n',iwavef,blocksize,vectsize,cone,blockvectorz,vectsize,&
1189 &   blockvectorax,vectsize,czero,tsubham,iwavef,x_cplx=x_cplx)
1190 
1191    if (gs_hamk%usepaw==0) then
1192      ! MG FIXME: Here gfortran4.9 allocates temporary array for C in abi_d2zgemm.
1193      call abi_xgemm(cparam(cplx),'n',blocksize,iwavef,vectsize,cone,blockvectorvx,vectsize,&
1194 &     blockvectorz,vectsize,czero,totvnl(cplx*bblocksize+1:cplx*iwavef,1:iwavef),blocksize,x_cplx=x_cplx)
1195    end if
1196 
1197    do iblocksize=1,blocksize
1198      do ii=1,bblocksize+iblocksize
1199        if ( cplx == 1 ) then
1200          subham(isubh)  = tsubham(ii,iblocksize)
1201          subham(isubh+1)= zero
1202        else
1203          subham(isubh)  = tsubham(2*ii-1,iblocksize)
1204          subham(isubh+1)= tsubham(2*ii  ,iblocksize)
1205        end if
1206        isubh=isubh+2
1207      end do
1208    end do
1209    ABI_DEALLOCATE(tsubham)
1210    ABI_DEALLOCATE(blockvectorz)
1211 !  comm for subham and subvnl are made in vtowfk
1212 
1213    ABI_DEALLOCATE(pcon)
1214    ABI_DEALLOCATE(blockvectory)
1215    ABI_DEALLOCATE(blockvectorby)
1216    ABI_DEALLOCATE(gramyx)
1217    ABI_DEALLOCATE(blockvectorx)
1218    ABI_DEALLOCATE(blockvectorax)
1219    ABI_DEALLOCATE(blockvectorbx)
1220    ABI_DEALLOCATE(blockvectorr)
1221    ABI_DEALLOCATE(blockvectorar)
1222    ABI_DEALLOCATE(blockvectorbr)
1223    ABI_DEALLOCATE(blockvectorp)
1224    ABI_DEALLOCATE(blockvectorap)
1225    ABI_DEALLOCATE(blockvectorbp)
1226    if (gs_hamk%usepaw==0) then
1227      ABI_DEALLOCATE(blockvectorvx)
1228      ABI_DEALLOCATE(blockvectorvp)
1229      ABI_DEALLOCATE(blockvectorvr)
1230    end if
1231    ABI_DEALLOCATE(blockvectordumm)
1232    ABI_DEALLOCATE(gramxax)
1233    ABI_DEALLOCATE(gramxar)
1234    ABI_DEALLOCATE(gramxap)
1235    ABI_DEALLOCATE(gramrar)
1236    ABI_DEALLOCATE(gramrap)
1237    ABI_DEALLOCATE(grampap)
1238    ABI_DEALLOCATE(gramxbx)
1239    ABI_DEALLOCATE(gramxbr)
1240    ABI_DEALLOCATE(gramxbp)
1241    ABI_DEALLOCATE(gramrbr)
1242    ABI_DEALLOCATE(gramrbp)
1243    ABI_DEALLOCATE(grampbp)
1244    ABI_DEALLOCATE(transf3)
1245    ABI_DEALLOCATE(transf5)
1246    ABI_DEALLOCATE(lambda)
1247    ABI_DEALLOCATE(residualnorms)
1248    if(use_linalg_gpu==1) then
1249      call dealloc_on_gpu(bblockvector_gpu)
1250      call dealloc_on_gpu(gram_gpu)
1251    end if
1252 
1253  end do  ! End big loop over bands inside blocks
1254 
1255  if(use_linalg_gpu==1) then
1256    call dealloc_on_gpu(blockvectorr_gpu)
1257    call dealloc_on_gpu(blockvectorar_gpu)
1258    call dealloc_on_gpu(blockvectorbr_gpu)
1259    call dealloc_on_gpu(A_gpu)
1260    call dealloc_on_gpu(C_gpu)
1261    call dealloc_on_gpu(coordx2_gpu)
1262    call dealloc_on_gpu(coordx3_gpu)
1263    !call gpu_linalg_shutdown()
1264  end if
1265 
1266  if(abs(dtset%timopt)==4) then
1267    call timab(525,2,tsec)
1268  end if
1269  call timab(530,2,tsec)
1270 
1271  DBG_ENTER("COLL")
1272 
1273  contains
1274 
1275    function gramindex(iblocksize)
1276 
1277 
1278 !This section has been created automatically by the script Abilint (TD).
1279 !Do not modify the following lines by hand.
1280 #undef ABI_FUNC
1281 #define ABI_FUNC 'gramindex'
1282 !End of the abilint section
1283 
1284    integer :: gramindex,iblocksize
1285    gramindex=(iblocksize-1)*cplx+1
1286  end function gramindex
1287 
1288 end subroutine lobpcgwf

ABINIT/m_lobpcgwf_old [ Modules ]

[ Top ] [ Modules ]

NAME

   m_lobpcgwf_old

FUNCTION

COPYRIGHT

  Copyright (C) 2008-2018 ABINIT group ()
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

21 #if defined HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include "abi_common.h"
26 
27 module m_lobpcgwf_old
28 
29  implicit none
30 
31  private