TABLE OF CONTENTS
ABINIT/lobpcgwf [ Functions ]
NAME
lobpcgwf
FUNCTION
this routine updates the whole wave functions at a given k-point, using the lobpcg method for a given spin-polarization, from a fixed hamiltonian but might also simply compute eigenvectors and eigenvalues at this k point. it will also update the matrix elements of the hamiltonian.
COPYRIGHT
Copyright (C) 1998-2018 ABINIT group (FBottin,GZ,AR,MT,FDahm) this file is distributed under the terms of the gnu general public license, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
INPUTS
dtset <type(dataset_type)>=all input variales for this dataset gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k icg=shift to be applied on the location of data in the array cg igsc=shift to be applied on the location of data in the array gsc kinpw(npw)=(modified) kinetic energy for each plane wave (hartree) mcg=second dimension of the cg array mgsc=second dimension of the gsc array mpi_enreg=informations about MPI parallelization nband_k=number of bands at this k point for that spin polarization nbdblock : number of blocks npw_k=number of plane waves at this k point prtvol=control print volume and debugging output
OUTPUT
resid_k(nband_k)=residuals for each states subham(nband_k*(nband_k+1))=the matrix elements of h If gs_hamk%usepaw==0: gsc(2,mgsc)=<g|s|c> matrix elements (s=overlap) totvnl(nband_k*(1-gs_hamk%usepaw),nband_k*(1-gs_hamk%usepaw))=the matrix elements of vnl
SIDE EFFECTS
cg(2,mcg)=updated wavefunctions
PARENTS
vtowfk
CHILDREN
abi_xcopy,abi_xgemm,abi_xheev,abi_xhegv,abi_xorthonormalize,abi_xtrsm alloc_on_gpu,copy_from_gpu,copy_on_gpu,dealloc_on_gpu,getghc,gpu_xgemm gpu_xorthonormalize,gpu_xtrsm,prep_getghc,setwfparameter,timab,wfcopy wrtout,xmpi_sum,xprecon
SOURCE
55 #if defined HAVE_CONFIG_H 56 #include "config.h" 57 #endif 58 59 #include "abi_common.h" 60 61 subroutine lobpcgwf(cg,dtset,gs_hamk,gsc,icg,igsc,kinpw,mcg,mgsc,mpi_enreg,& 62 & nband_k,nbdblock,npw_k,prtvol,resid_k,subham,totvnl) 63 64 65 use defs_abitypes 66 use defs_basis 67 use m_profiling_abi 68 use m_lobpcg 69 use m_abi_linalg 70 use m_wfutils 71 use m_xmpi 72 use m_errors 73 use iso_c_binding 74 75 use m_hamiltonian, only : gs_hamiltonian_type 76 use m_pawcprj, only : pawcprj_type 77 78 !This section has been created automatically by the script Abilint (TD). 79 !Do not modify the following lines by hand. 80 #undef ABI_FUNC 81 #define ABI_FUNC 'lobpcgwf' 82 use interfaces_14_hidewrite 83 use interfaces_18_timing 84 use interfaces_66_wfs 85 !End of the abilint section 86 87 implicit none 88 89 !Arguments ------------------------------------ 90 integer,intent(in) :: icg,igsc,mcg,mgsc,nband_k,nbdblock,npw_k,prtvol 91 type(gs_hamiltonian_type),intent(inout) :: gs_hamk 92 type(dataset_type),intent(in) :: dtset 93 type(mpi_type),intent(inout) :: mpi_enreg 94 real(dp),intent(inout) :: cg(2,mcg),gsc(2,mgsc) 95 real(dp),intent(in) :: kinpw(npw_k) 96 real(dp),intent(out) :: resid_k(nband_k) 97 real(dp),intent(inout) :: subham(nband_k*(nband_k+1)) 98 real(dp),intent(inout) :: totvnl((3-gs_hamk%istwf_k)*nband_k*(1-gs_hamk%usepaw),nband_k*(1-gs_hamk%usepaw)) 99 100 !Local variables------------------------------- 101 integer, parameter :: tim_getghc=5 102 integer :: activepsize,activersize,bblocksize,bigorder,blocksize,cpopt 103 integer :: cond_try 104 integer :: iblocksize,iblock,ierr,ii,info,istwf_k,isubh 105 integer :: iterationnumber 106 integer :: iwavef,i1,i2,i3,i4,maxiterations,my_nspinor 107 integer :: nrestart,optekin,optpcon,restart 108 integer :: sij_opt,timopt,tim_wfcopy,tim_xeigen 109 integer :: tim_xortho,tim_xprecon,use_lapack_gpu,use_linalg_gpu,vectsize 110 logical :: gen_eigenpb 111 integer :: cplx 112 real(dp) :: condestgramb,deltae,deold,dum 113 complex(dpc) :: cminusone 114 real(dp) :: zvar(2) 115 logical :: havetoprecon 116 real(dp) :: tsec(2) 117 real(dp), allocatable :: gwavef(:,:),cwavef(:,:),gvnlc(:,:) 118 real(dp), allocatable :: swavef(:,:) 119 real(dp), allocatable :: residualnorms(:),eigen(:) 120 real(dp), allocatable :: tmpeigen(:) 121 real(dp), allocatable :: pcon(:,:) 122 real(dp), allocatable :: blockvectorx(:,:),blockvectorvx(:,:),blockvectorax(:,:),blockvectorbx(:,:) 123 real(dp),allocatable :: blockvectorr(:,:),blockvectorvr(:,:),blockvectorar(:,:),blockvectorbr(:,:) 124 real(dp),allocatable :: blockvectorp(:,:),blockvectorvp(:,:),blockvectorap(:,:),blockvectorbp(:,:),blockvectordumm(:,:) 125 real(dp),allocatable :: blockvectory(:,:),blockvectorby(:,:),blockvectorz(:,:) 126 real(dp),allocatable :: gramxax(:,:),gramxar(:,:),gramxap(:,:),gramrar(:,:),gramrap(:,:),grampap(:,:) 127 real(dp),allocatable :: gramxbx(:,:),gramxbr(:,:),gramxbp(:,:),gramrbr(:,:),gramrbp(:,:),grampbp(:,:) 128 real(dp),allocatable :: coordx1(:,:),coordx2(:,:),coordx3(:,:),lambda(:,:),grama(:,:),gramb(:,:),gramyx(:,:) 129 real(dp),allocatable :: tmpgramb(:,:),transf3(:,:,:),transf5(:,:,:) 130 real(dp), allocatable :: tsubham(:,:) 131 type(pawcprj_type) :: cprj_dum(gs_hamk%natom,0) 132 character(len=500) :: message 133 character, dimension(2) :: cparam 134 type(c_ptr) :: A_gpu,C_gpu,coordx2_gpu,coordx3_gpu,bblockvector_gpu,gram_gpu 135 type(c_ptr) :: blockvectorr_gpu,blockvectorar_gpu,blockvectorbr_gpu 136 137 !Index of a given band 138 !gramindex(iblocksize)=(iblocksize-1)*cplx+1 139 140 ! ********************************************************************* 141 142 DBG_ENTER("COLL") 143 144 call timab(530,1,tsec) 145 if(abs(dtset%timopt)==4) then 146 call timab(520,1,tsec) 147 end if 148 149 !########################################################################### 150 !################ INITIALISATION ########################################## 151 !########################################################################### 152 153 !For timing 154 timopt=dtset%timopt 155 tim_wfcopy=584 156 !tim_xcopy=584 157 tim_xeigen=587 158 !tim_xgemm=532 159 tim_xortho=535 160 tim_xprecon=536 161 !tim_xtrsm=535 162 163 !Variables 164 maxiterations=dtset%nline 165 gen_eigenpb=(gs_hamk%usepaw==1) 166 my_nspinor=max(1,dtset%nspinor/mpi_enreg%nproc_spinor) 167 cminusone=-cone 168 istwf_k=gs_hamk%istwf_k 169 info = 0 170 cparam(1)='t' 171 cparam(2)='c' 172 173 !Depends on istwfk 174 if ( istwf_k == 2 ) then 175 cplx=1 176 if (mpi_enreg%me_g0 == 1) then 177 vectsize=2*npw_k*my_nspinor-1 178 else 179 vectsize=2*npw_k*my_nspinor 180 end if 181 else 182 cplx=2 183 vectsize=npw_k*my_nspinor 184 end if 185 186 !For preconditionning 187 optekin=0;if (dtset%wfoptalg>10) optekin=0 188 optpcon=1;if (dtset%wfoptalg>10) optpcon=0 189 190 !For communication 191 blocksize=mpi_enreg%nproc_fft 192 if(mpi_enreg%paral_kgb==1) blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp 193 !IF you want to compare with new lobpcg in sequential uncomment the following 194 !line 195 !blocksize=mpi_enreg%nproc_band*mpi_enreg%bandpp 196 197 !Iniitializations/allocations of GPU parallelism 198 use_linalg_gpu=0;use_lapack_gpu=0 199 if ((dtset%use_gpu_cuda==1).and. & 200 & (vectsize*blocksize*blocksize>dtset%gpu_linalg_limit)) use_linalg_gpu=1 201 #if defined HAVE_LINALG_MAGMA 202 use_lapack_gpu=use_linalg_gpu 203 #endif 204 if(use_linalg_gpu==1) then 205 ! call gpu_linalg_init() 206 call alloc_on_gpu(A_gpu,cplx*dp*vectsize*blocksize) 207 call alloc_on_gpu(C_gpu,cplx*dp*vectsize*blocksize) 208 call alloc_on_gpu(blockvectorr_gpu,cplx*dp*vectsize*blocksize) 209 call alloc_on_gpu(blockvectorar_gpu,cplx*dp*vectsize*blocksize) 210 call alloc_on_gpu(blockvectorbr_gpu,cplx*dp*vectsize*blocksize) 211 call alloc_on_gpu(coordx2_gpu,cplx*dp*blocksize*blocksize) 212 call alloc_on_gpu(coordx3_gpu,cplx*dp*blocksize*blocksize) 213 end if 214 215 if(abs(dtset%timopt)==4) then 216 call timab(520,2,tsec) 217 end if 218 219 !########################################################################### 220 !################ BIG LOOP OVER BLOCKS #################################### 221 !########################################################################### 222 223 do iblock=1,nbdblock 224 225 if(abs(dtset%timopt)==4) then 226 call timab(521,1,tsec) 227 end if 228 229 havetoprecon=.true. 230 nrestart=0 231 bblocksize=(iblock-1)*blocksize 232 233 ! allocations 234 ABI_ALLOCATE(pcon,(npw_k,blocksize)) 235 ABI_ALLOCATE(blockvectorx,(cplx*vectsize,blocksize)) 236 ABI_ALLOCATE(blockvectorax,(cplx*vectsize,blocksize)) 237 ABI_ALLOCATE(blockvectorbx,(cplx*vectsize,blocksize)) 238 ABI_ALLOCATE(blockvectorr,(cplx*vectsize,blocksize)) 239 ABI_ALLOCATE(blockvectorar,(cplx*vectsize,blocksize)) 240 ABI_ALLOCATE(blockvectorbr,(cplx*vectsize,blocksize)) 241 ABI_ALLOCATE(blockvectorp,(cplx*vectsize,blocksize)) 242 ABI_ALLOCATE(blockvectorap,(cplx*vectsize,blocksize)) 243 ABI_ALLOCATE(blockvectorbp,(cplx*vectsize,blocksize)) 244 ABI_ALLOCATE(blockvectordumm,(cplx*vectsize,blocksize)) 245 ABI_ALLOCATE(gramxax,(cplx*blocksize,blocksize)) 246 ABI_ALLOCATE(gramxar,(cplx*blocksize,blocksize)) 247 ABI_ALLOCATE(gramxap,(cplx*blocksize,blocksize)) 248 ABI_ALLOCATE(gramrar,(cplx*blocksize,blocksize)) 249 ABI_ALLOCATE(gramrap,(cplx*blocksize,blocksize)) 250 ABI_ALLOCATE(grampap,(cplx*blocksize,blocksize)) 251 ABI_ALLOCATE(gramxbx,(cplx*blocksize,blocksize)) 252 ABI_ALLOCATE(gramxbr,(cplx*blocksize,blocksize)) 253 ABI_ALLOCATE(gramxbp,(cplx*blocksize,blocksize)) 254 ABI_ALLOCATE(gramrbr,(cplx*blocksize,blocksize)) 255 ABI_ALLOCATE(gramrbp,(cplx*blocksize,blocksize)) 256 ABI_ALLOCATE(grampbp,(cplx*blocksize,blocksize)) 257 ABI_ALLOCATE(transf3,(cplx*blocksize,blocksize,3)) 258 ABI_ALLOCATE(transf5,(cplx*blocksize,blocksize,5)) 259 ABI_ALLOCATE(lambda,(cplx*blocksize,blocksize)) 260 ABI_ALLOCATE(residualnorms,(blocksize)) 261 262 ABI_ALLOCATE(blockvectory,(cplx*vectsize,bblocksize)) 263 ABI_ALLOCATE(blockvectorby,(cplx*vectsize,bblocksize)) 264 ABI_ALLOCATE(gramyx,(cplx*bblocksize,blocksize)) 265 if (gs_hamk%usepaw==0) then 266 ABI_ALLOCATE(blockvectorvx,(cplx*vectsize,blocksize)) 267 ABI_ALLOCATE(blockvectorvr,(cplx*vectsize,blocksize)) 268 ABI_ALLOCATE(blockvectorvp,(cplx*vectsize,blocksize)) 269 end if 270 271 if(use_linalg_gpu==1) then 272 if(iblock/=1) then 273 call alloc_on_gpu(bblockvector_gpu,cplx*dp*vectsize*bblocksize) 274 call alloc_on_gpu(gram_gpu,cplx*dp*bblocksize*blocksize) 275 else 276 call alloc_on_gpu(bblockvector_gpu,cplx*dp*vectsize*blocksize) 277 call alloc_on_gpu(gram_gpu,cplx*dp*blocksize*blocksize) 278 end if 279 end if 280 281 ! Initialize global variables in m_wfutils. 282 call setWFParameter(cplx,mpi_enreg%me_g0,npw_k,my_nspinor,icg,igsc,blocksize) 283 284 ! transfer array of wf coeff in iblock to blockvectorx 285 call wfcopy('D',blocksize*vectsize,cg,1,blockvectorx,1,blocksize,iblock,'C',withbbloc=.true.,& 286 & timopt=timopt,tim_wfcopy=tim_wfcopy) 287 288 ! !!!!!!!!!!!!!!!!!!!!!!!! Begin if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!! 289 ! transfer array of wf coeff less than iblock to blockvectory 290 if(iblock /=1) then 291 call wfcopy('D',bblocksize*vectsize,cg,1,blockvectory,1,bblocksize,iblock,'C',withbbloc=.false.,& 292 & timopt=timopt,tim_wfcopy=tim_wfcopy) 293 294 if(gen_eigenpb) then 295 call wfcopy('D',bblocksize*vectsize,gsc,1,blockvectorby,1,bblocksize,iblock,'S',withbbloc=.false.,& 296 & timopt=timopt,tim_wfcopy=tim_wfcopy) 297 else 298 call abi_xcopy(vectsize*bblocksize,blockvectory,1,blockvectorby,1,x_cplx=x_cplx) 299 end if 300 301 ! b-orthogonalize x to the constraint y (supposed b-orthonormal) 302 ! blockvectorx=blockvectorx-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorx)) 303 304 call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,& 305 & vectsize,blockvectorx,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx) 306 307 if(abs(dtset%timopt)==3) then 308 call timab(533,1,tsec) 309 end if 310 call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr) 311 if(abs(dtset%timopt)==3) then 312 call timab(533,2,tsec) 313 end if 314 315 call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,& 316 & vectsize,gramyx,bblocksize,cone,blockvectorx,vectsize,x_cplx=x_cplx) 317 318 end if 319 ! !!!!!!!!!!!!!!!!!!!!!!!! End if iblock /=1 !!!!!!!!!!!!!!!!!!!!!!!!!!! 320 321 ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize)) 322 ABI_ALLOCATE(gwavef,(2,npw_k*my_nspinor*blocksize)) 323 ABI_ALLOCATE(gvnlc,(2,npw_k*my_nspinor*blocksize)) 324 ABI_ALLOCATE(swavef,(2,npw_k*my_nspinor*blocksize)) 325 326 call wfcopy('I',vectsize*blocksize,blockvectorx,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,& 327 & timopt=timopt,tim_wfcopy=tim_wfcopy) 328 329 if(abs(dtset%timopt)==4) then 330 call timab(521,2,tsec) 331 end if 332 if(abs(dtset%timopt)==4) then 333 call timab(526,1,tsec) 334 end if 335 336 cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1 337 338 if (mpi_enreg%paral_kgb==0) then 339 call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlc,dum,& 340 & mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0) 341 else 342 call prep_getghc(cwavef,gs_hamk,gvnlc,gwavef,swavef,dum,blocksize,mpi_enreg,& 343 & prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.) 344 end if 345 if(abs(dtset%timopt)==4) then 346 call timab(526,2,tsec) 347 end if 348 if(abs(dtset%timopt)==4) then 349 call timab(522,1,tsec) 350 end if 351 352 if ( gen_eigenpb ) then 353 call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbx,1,blocksize,iblock,'W',withbbloc=.false.,& 354 & timopt=timopt,tim_wfcopy=tim_wfcopy) 355 else 356 call wfcopy('D',vectsize*blocksize,gvnlc,1,blockvectorvx,1,blocksize,iblock,'W',withbbloc=.false.,& 357 & timopt=timopt,tim_wfcopy=tim_wfcopy) 358 call abi_xcopy(vectsize*blocksize,blockvectorx,1,blockvectorbx,1,x_cplx=x_cplx) 359 end if 360 361 call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorax,1,blocksize,iblock,'W',withbbloc=.false.,& 362 & timopt=timopt,tim_wfcopy=tim_wfcopy) 363 364 ABI_DEALLOCATE(cwavef) 365 ABI_DEALLOCATE(gwavef) 366 ABI_DEALLOCATE(gvnlc) 367 ABI_DEALLOCATE(swavef) 368 369 call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,mpi_enreg%comm_bandspinorfft,gramxbx,vectsize,& 370 & x_cplx,timopt=timopt,tim_xortho=tim_xortho) 371 372 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorbx,vectsize,x_cplx=x_cplx) 373 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorax,vectsize,x_cplx=x_cplx) 374 375 if (gs_hamk%usepaw==0) then 376 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramxbx,blocksize,blockvectorvx,vectsize,x_cplx=x_cplx) 377 end if 378 379 ! Do rayleigh ritz on a in space x 380 ! gramxax=matmul(transpose(blockvectorx),blockvectorax) 381 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorx,& 382 & vectsize,blockvectorax,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx) 383 384 if(abs(dtset%timopt)==3) then 385 call timab(533,1,tsec) 386 end if 387 call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr) 388 if(abs(dtset%timopt)==3) then 389 call timab(533,2,tsec) 390 end if 391 ABI_ALLOCATE(eigen,(blocksize)) 392 393 call abi_xheev('v','u',blocksize,gramxax,eigen,x_cplx=cplx,istwf_k=istwf_k, & 394 timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu) 395 396 ! blockvectorx=matmul(blockvectorx,gramxax) 397 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,& 398 & vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 399 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorx,1,x_cplx=x_cplx) 400 401 ! blockvectorax=matmul(blockvectorax,gramxax) 402 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,& 403 & vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 404 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorax,1,x_cplx=x_cplx) 405 406 ! blockvectorvx=matmul(blockvectorvx,gramxax) 407 if (gs_hamk%usepaw==0) then 408 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,& 409 & vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 410 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvx,1,x_cplx=x_cplx) 411 end if 412 413 ! blockvectorbx=matmul(blockvectorbx,gramxax) 414 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,& 415 & vectsize,gramxax,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 416 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbx,1,x_cplx=x_cplx) 417 418 do iblocksize=1,blocksize 419 zvar=(/eigen(iblocksize),zero/) 420 call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx) 421 end do 422 ABI_DEALLOCATE(eigen) 423 424 if(abs(dtset%timopt)==4) then 425 call timab(522,2,tsec) 426 end if 427 428 ! ########################################################################### 429 ! ################ PERFORM LOOP ON NLINE #################################### 430 ! ########################################################################### 431 ! now the main alogrithm 432 iter: do iterationnumber=1,maxiterations 433 434 if(abs(dtset%timopt)==4) then 435 call timab(523,1,tsec) 436 end if 437 438 ! Build residual 439 ! blockvectorr=blockvectorax-matmul(blockvectorx,lambda) 440 call xprecon(blockvectorbx,lambda,blocksize,& 441 & iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,& 442 & optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon) 443 444 residualnorms=sum(blockvectorr**2,dim=1) 445 446 if(abs(dtset%timopt)==3) then 447 call timab(533,1,tsec) 448 end if 449 call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr) 450 if(abs(dtset%timopt)==3) then 451 call timab(533,2,tsec) 452 end if 453 454 resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize) 455 456 ! If residual sufficiently small stop line minimizations 457 if (abs(maxval(residualnorms(1:blocksize)))<dtset%tolwfr) then 458 if (prtvol > 0) then 459 write(message, '(a,i0,a,i0,a,es12.4)' ) & 460 & ' lobpcgwf: block ',iblock,' converged after ',iterationnumber,& 461 & ' line minimizations: maxval(resid(1:blocksize)) =',maxval(residualnorms(1:blocksize)) 462 call wrtout(std_out,message,'PERS') 463 end if 464 havetoprecon=.false. 465 exit 466 end if 467 468 if(use_linalg_gpu==1) then 469 call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize) 470 end if 471 472 if(iblock /=1) then 473 ! Residuals orthogonal to blockvectorby 474 ! blockvectorr=blockvectorr-matmul(blockvectory,matmul((blockvectorby)^T,blockvectorr)) 475 476 if(use_linalg_gpu==1) then 477 call copy_on_gpu(blockvectorby,bblockvector_gpu,cplx*dp*vectsize*bblocksize) 478 call gpu_xgemm(cplx,cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,bblockvector_gpu,& 479 & vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,bblocksize) 480 call copy_from_gpu(gramyx,gram_gpu,cplx*dp*bblocksize*blocksize) 481 else 482 call abi_xgemm(cparam(cplx),'n',bblocksize,blocksize,vectsize,cone,blockvectorby,& 483 & vectsize,blockvectorr,vectsize,czero,gramyx,bblocksize,x_cplx=x_cplx) 484 end if 485 486 if(abs(dtset%timopt)==3) then 487 call timab(533,1,tsec) 488 end if 489 call xmpi_sum(gramyx,mpi_enreg%comm_bandspinorfft,ierr) 490 if(abs(dtset%timopt)==3) then 491 call timab(533,2,tsec) 492 end if 493 494 if(use_linalg_gpu==1) then 495 call copy_on_gpu(gramyx,gram_gpu,cplx*dp*bblocksize*blocksize) 496 call copy_on_gpu(blockvectory,bblockvector_gpu,cplx*dp*vectsize*bblocksize) 497 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,bblocksize,cminusone,bblockvector_gpu,& 498 & vectsize,gram_gpu,bblocksize,cone,blockvectorr_gpu,vectsize) 499 else 500 call abi_xgemm('n','n',vectsize,blocksize,bblocksize,cminusone,blockvectory,& 501 & vectsize,gramyx,bblocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx) 502 end if 503 504 end if 505 506 ! Residuals orthogonal to blockvectorx 507 ! blockvectorr=blockvectorr-matmul(blockvectorx,matmul((blockvectorbx)^T,blockvectorr)) 508 if(use_linalg_gpu==1) then 509 call copy_on_gpu(blockvectorbx,C_gpu,cplx*dp*vectsize*blocksize) 510 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,C_gpu,& 511 & vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize) 512 call copy_from_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize) 513 else 514 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,& 515 & vectsize,blockvectorr,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx) 516 end if 517 518 if(abs(dtset%timopt)==3) then 519 call timab(533,1,tsec) 520 end if 521 call xmpi_sum(gramxax,mpi_enreg%comm_bandspinorfft,ierr) 522 if(abs(dtset%timopt)==3) then 523 call timab(533,2,tsec) 524 end if 525 526 if(use_linalg_gpu==1) then 527 call copy_on_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize) 528 call copy_on_gpu(blockvectorx,C_gpu,cplx*dp*vectsize*blocksize) 529 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cminusone,C_gpu,& 530 & vectsize,gram_gpu,blocksize,cone,blockvectorr_gpu,vectsize) 531 call copy_from_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize) 532 else 533 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cminusone,blockvectorx,& 534 & vectsize,gramxax,blocksize,cone,blockvectorr,vectsize,x_cplx=x_cplx) 535 end if 536 537 ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize)) 538 ABI_ALLOCATE(gwavef,(2,npw_k*my_nspinor*blocksize)) 539 ABI_ALLOCATE(gvnlc,(2,npw_k*my_nspinor*blocksize)) 540 ABI_ALLOCATE(swavef,(2,npw_k*my_nspinor*blocksize)) 541 542 call wfcopy('I',vectsize*blocksize,blockvectorr,1,cwavef,1,blocksize,iblock,'W',withbbloc=.false.,& 543 & timopt=timopt,tim_wfcopy=tim_wfcopy) 544 545 cpopt=-1;sij_opt=0;if (gen_eigenpb) sij_opt=1 546 547 if(abs(dtset%timopt)==4) then 548 call timab(523,2,tsec) 549 end if 550 if(abs(dtset%timopt)==4) then 551 call timab(526,1,tsec) 552 end if 553 554 if (mpi_enreg%paral_kgb==0) then 555 call getghc(cpopt,cwavef,cprj_dum,gwavef,swavef,gs_hamk,gvnlc,dum,& 556 & mpi_enreg,blocksize,prtvol,sij_opt,tim_getghc,0) 557 else 558 call prep_getghc(cwavef,gs_hamk,gvnlc,gwavef,swavef,dum,blocksize,mpi_enreg,& 559 & prtvol,sij_opt,cpopt,cprj_dum,already_transposed=.false.) 560 end if 561 562 if(abs(dtset%timopt)==4) then 563 call timab(526,2,tsec) 564 end if 565 if(abs(dtset%timopt)==4) then 566 call timab(524,1,tsec) 567 end if 568 569 if (gen_eigenpb) then 570 call wfcopy('D',vectsize*blocksize,swavef,1,blockvectorbr,1,blocksize,iblock,'W',withbbloc=.false.,& 571 & timopt=timopt,tim_wfcopy=tim_wfcopy) 572 else 573 call abi_xcopy(vectsize*blocksize,blockvectorr,1,blockvectorbr,1,x_cplx=x_cplx) 574 call wfcopy('D',vectsize*blocksize,gvnlc,1,blockvectorvr,1,blocksize,iblock,'W',withbbloc=.false.,& 575 & timopt=timopt,tim_wfcopy=tim_wfcopy) 576 end if 577 578 call wfcopy('D',vectsize*blocksize,gwavef,1,blockvectorar,1,blocksize,iblock,'W',withbbloc=.false.,& 579 & timopt=timopt,tim_wfcopy=tim_wfcopy) 580 581 ABI_DEALLOCATE(cwavef) 582 ABI_DEALLOCATE(gwavef) 583 ABI_DEALLOCATE(gvnlc) 584 ABI_DEALLOCATE(swavef) 585 586 if(use_linalg_gpu==1) then 587 call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize) 588 call gpu_xorthonormalize(blockvectorr_gpu,blockvectorbr_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,& 589 & x_cplx,timopt=timopt,tim_xortho=tim_xortho) 590 call copy_from_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize) 591 call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorbr_gpu,vectsize) 592 call copy_from_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize) 593 call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize) 594 call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,blockvectorar_gpu,vectsize) 595 call copy_from_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize) 596 if (gs_hamk%usepaw==0) then 597 call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize) 598 call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize) 599 call copy_from_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize) 600 end if 601 call copy_from_gpu(gramrbr,gram_gpu,cplx*dp*blocksize*blocksize) 602 else 603 call abi_xorthonormalize(blockvectorr,blockvectorbr,blocksize,mpi_enreg%comm_bandspinorfft,gramrbr,vectsize,& 604 & x_cplx,timopt=timopt,tim_xortho=tim_xortho) 605 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorbr,vectsize,x_cplx=x_cplx) 606 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorar,vectsize,x_cplx=x_cplx) 607 if (gs_hamk%usepaw==0) then 608 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,gramrbr,blocksize,blockvectorvr,vectsize,x_cplx=x_cplx) 609 end if 610 end if 611 612 if(iterationnumber>1) then 613 if(use_linalg_gpu==1) then 614 call copy_on_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize) 615 call copy_on_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize) 616 call gpu_xorthonormalize(A_gpu,C_gpu,blocksize,mpi_enreg%comm_bandspinorfft,gram_gpu,vectsize,& 617 & x_cplx,timopt=timopt,tim_xortho=tim_xortho) 618 call copy_from_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize) 619 call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,C_gpu,vectsize) 620 call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize) 621 call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize) 622 call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize) 623 call copy_from_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize) 624 if (gs_hamk%usepaw==0) then 625 call copy_on_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize) 626 call gpu_xtrsm(cplx,'r','u','n','n',vectsize,blocksize,cone,gram_gpu,blocksize,A_gpu,vectsize) 627 call copy_from_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize) 628 end if 629 call copy_from_gpu(grampbp,gram_gpu,cplx*dp*blocksize*blocksize) 630 else 631 ! call orthonormalize(blockvectorp,blockvectorbp,blockvectorap) 632 call abi_xorthonormalize(blockvectorp,blockvectorbp,blocksize,mpi_enreg%comm_bandspinorfft,grampbp,vectsize,& 633 & x_cplx,timopt=timopt,tim_xortho=tim_xortho) 634 ! blockvectorap=matmul(blockvectorap,grampbp) 635 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorbp,vectsize,x_cplx=x_cplx) 636 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorap,vectsize,x_cplx=x_cplx) 637 if (gs_hamk%usepaw==0) then 638 call abi_xtrsm('r','u','n','n',vectsize,blocksize,cone,grampbp,blocksize,blockvectorvp,vectsize,x_cplx=x_cplx) 639 end if 640 end if 641 end if 642 643 activersize=blocksize 644 if (iterationnumber==1) then 645 activepsize=0 646 restart=1 647 else 648 activepsize=blocksize 649 restart=0 650 end if 651 652 ! gramxar=matmul((blockvectorax)^T,blockvectorr) 653 ! gramrar=matmul((blockvectorar)^T,blockvectorr) 654 ! gramxax=matmul((blockvectorax)^T,blockvectorx) 655 if(use_linalg_gpu==1) then 656 call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize) 657 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,& 658 & vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize) 659 call copy_from_gpu(gramxar,gram_gpu,cplx*dp*blocksize*blocksize) 660 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar_gpu,& 661 & vectsize,blockvectorr_gpu,vectsize,czero,gram_gpu,blocksize) 662 call copy_from_gpu(gramrar,gram_gpu,cplx*dp*blocksize*blocksize) 663 call copy_on_gpu(blockvectorx,C_gpu,cplx*dp*vectsize*blocksize) 664 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,cone,A_gpu,& 665 & vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize) 666 call copy_from_gpu(gramxax,gram_gpu,cplx*dp*blocksize*blocksize) 667 else 668 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,& 669 & vectsize,blockvectorr,vectsize,czero,gramxar,blocksize,x_cplx=x_cplx) 670 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,& 671 & vectsize,blockvectorr,vectsize,czero,gramrar,blocksize,x_cplx=x_cplx) 672 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,& 673 & vectsize,blockvectorx,vectsize,czero,gramxax,blocksize,x_cplx=x_cplx) 674 end if 675 676 call abi_xcopy(blocksize*blocksize,gramxar,1,transf3(:,:,1),1,x_cplx=x_cplx) 677 call abi_xcopy(blocksize*blocksize,gramrar,1,transf3(:,:,2),1,x_cplx=x_cplx) 678 call abi_xcopy(blocksize*blocksize,gramxax,1,transf3(:,:,3),1,x_cplx=x_cplx) 679 if(abs(dtset%timopt)==3) then 680 call timab(533,1,tsec) 681 end if 682 call xmpi_sum(transf3,mpi_enreg%comm_bandspinorfft,ierr) 683 if(abs(dtset%timopt)==3) then 684 call timab(533,2,tsec) 685 end if 686 687 call abi_xcopy(blocksize*blocksize,transf3(:,:,1),1,gramxar,1,x_cplx=x_cplx) 688 call abi_xcopy(blocksize*blocksize,transf3(:,:,2),1,gramrar,1,x_cplx=x_cplx) 689 call abi_xcopy(blocksize*blocksize,transf3(:,:,3),1,gramxax,1,x_cplx=x_cplx) 690 691 ! gramxbx=matmul((blockvectorbx)^T,blockvectorx) 692 ! gramrbr=matmul((blockvectorbr)^T,blockvectorr) 693 ! gramxbr=matmul((blockvectorbx)^T,blockvectorr) 694 ! Note that the gramb matrix is more easier to construct than grama: 695 ! i) <x|B|x>=<r|B|r>=<p|B|p>=(1;0) 696 ! since the x, r and p blockvector are normalized 697 ! ii) <r|B|x>=(0;0) 698 ! since the x and r blockvector are orthogonalized 699 ! iii) The <p|B|r> and <p|B|x> have to be computed. 700 gramxbx(:,:)=zero 701 gramrbr(:,:)=zero 702 gramxbr(:,:)=zero 703 do iblocksize=1,blocksize 704 zvar=(/one,zero/) 705 call abi_xcopy(1,zvar,1,gramxbx(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx) 706 call abi_xcopy(1,zvar,1,gramrbr(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx) 707 end do 708 709 ! ########################################################################### 710 ! ################ PERFORM LOOP ON COND ##################################### 711 ! ########################################################################### 712 713 i1=0;i2=blocksize;i3=2*blocksize;i4=3*blocksize 714 cond: do cond_try=1,2 !2 when restart 715 if (restart==0) then 716 717 ! gramxap=matmul((blockvectorax)^T,blockvectorp) 718 ! gramrap=matmul((blockvectorar)^T,blockvectorp) 719 ! grampap=matmul((blockvectorap)^T,blockvectorp) 720 ! gramxbp=matmul((blockvectorbx)^T,blockvectorp) 721 ! gramrbp=matmul((blockvectorbr)^T,blockvectorp) 722 ! grampbp=matmul((blockvectorbp)^T,blockvectorp) 723 if(use_linalg_gpu==1) then 724 call copy_on_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize) 725 call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize) 726 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,& 727 & cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize) 728 call copy_from_gpu(gramxap,gram_gpu,cplx*dp*blocksize*blocksize) 729 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,& 730 & cone,blockvectorar_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize) 731 call copy_from_gpu(gramrap,gram_gpu,cplx*dp*blocksize*blocksize) 732 call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize) 733 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,& 734 & cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize) 735 call copy_from_gpu(grampap,gram_gpu,cplx*dp*blocksize*blocksize) 736 call copy_on_gpu(blockvectorbx,A_gpu,cplx*dp*vectsize*blocksize) 737 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,& 738 & cone,A_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize) 739 call copy_from_gpu(gramxbp,gram_gpu,cplx*dp*blocksize*blocksize) 740 call gpu_xgemm(cplx,cparam(cplx),'n',blocksize,blocksize,vectsize,& 741 & cone,blockvectorbr_gpu,vectsize,C_gpu,vectsize,czero,gram_gpu,blocksize) 742 call copy_from_gpu(gramrbp,gram_gpu,cplx*dp*blocksize*blocksize) 743 else 744 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorax,& 745 & vectsize,blockvectorp,vectsize,czero,gramxap,blocksize,x_cplx=x_cplx) 746 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorar,& 747 & vectsize,blockvectorp,vectsize,czero,gramrap,blocksize,x_cplx=x_cplx) 748 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorap,& 749 & vectsize,blockvectorp,vectsize,czero,grampap,blocksize,x_cplx=x_cplx) 750 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbx,& 751 & vectsize,blockvectorp,vectsize,czero,gramxbp,blocksize,x_cplx=x_cplx) 752 call abi_xgemm(cparam(cplx),'n',blocksize,blocksize,vectsize,cone,blockvectorbr,& 753 & vectsize,blockvectorp,vectsize,czero,gramrbp,blocksize,x_cplx=x_cplx) 754 end if 755 ! It's not necessary to compute the last one: <p|B|p>=(1;0) (see above) 756 transf5(:,:,1)=gramxap(:,:) 757 transf5(:,:,2)=gramrap(:,:) 758 transf5(:,:,3)=grampap(:,:) 759 transf5(:,:,4)=gramxbp(:,:) 760 transf5(:,:,5)=gramrbp(:,:) 761 if(abs(dtset%timopt)==3) then 762 call timab(533,1,tsec) 763 end if 764 call xmpi_sum(transf5,mpi_enreg%comm_bandspinorfft,ierr) 765 if(abs(dtset%timopt)==3) then 766 call timab(533,2,tsec) 767 end if 768 gramxap(:,:)=transf5(:,:,1) 769 gramrap(:,:)=transf5(:,:,2) 770 grampap(:,:)=transf5(:,:,3) 771 gramxbp(:,:)=transf5(:,:,4) 772 gramrbp(:,:)=transf5(:,:,5) 773 grampbp(:,:)=zero 774 do iblocksize=1,blocksize 775 zvar=(/one,zero/) 776 call abi_xcopy(1,zvar,1,grampbp(cplx*(iblocksize-1)+1:cplx*iblocksize,iblocksize),1,x_cplx=x_cplx) 777 end do 778 bigorder=i4 779 ABI_ALLOCATE(grama,(cplx*i4,i4)) 780 ABI_ALLOCATE(gramb,(cplx*i4,i4)) 781 ABI_ALLOCATE(eigen,(i4)) 782 ! ABI_ALLOCATE(coordx,(cplx*i4,blocksize)) 783 ABI_ALLOCATE(coordx1,(cplx*blocksize,blocksize)) 784 ABI_ALLOCATE(coordx2,(cplx*blocksize,blocksize)) 785 ABI_ALLOCATE(coordx3,(cplx*blocksize,blocksize)) 786 grama(:,:)=zero;gramb(:,:)=zero 787 grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax 788 grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar 789 grama(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxap 790 grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar 791 grama(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrap 792 grama(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampap 793 gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx 794 gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr 795 gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i3+1:i4)=gramxbp 796 gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr 797 gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i3+1:i4)=gramrbp 798 gramb(gramindex(i3+1):gramindex(i4)+cplx-1,i3+1:i4)=grampbp 799 else 800 bigorder=i3 801 ABI_ALLOCATE(grama,(cplx*i3,i3)) 802 ABI_ALLOCATE(gramb,(cplx*i3,i3)) 803 ABI_ALLOCATE(eigen,(i3)) 804 ! ABI_ALLOCATE(coordx,(cplx*i3,blocksize)) 805 ABI_ALLOCATE(coordx1,(cplx*blocksize,blocksize)) 806 ABI_ALLOCATE(coordx2,(cplx*blocksize,blocksize)) 807 grama(:,:)=zero;gramb(:,:)=zero 808 grama(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxax 809 grama(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxar 810 grama(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrar 811 gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i1+1:i2)=gramxbx 812 gramb(gramindex(i1+1):gramindex(i2)+cplx-1,i2+1:i3)=gramxbr 813 gramb(gramindex(i2+1):gramindex(i3)+cplx-1,i2+1:i3)=gramrbr 814 end if 815 816 ABI_ALLOCATE(tmpgramb,(cplx*bigorder,bigorder)) 817 ABI_ALLOCATE(tmpeigen,(bigorder)) 818 tmpgramb=gramb 819 820 call abi_xheev('v','u',bigorder,tmpgramb,tmpeigen,x_cplx=cplx,istwf_k=istwf_k, & 821 & timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu) 822 823 condestgramb=tmpeigen(bigorder)/tmpeigen(1) 824 ABI_DEALLOCATE(tmpgramb) 825 ABI_DEALLOCATE(tmpeigen) 826 827 if (condestgramb.gt.1d+5.or.condestgramb.lt.0.d0.or.info/=0) then 828 write(std_out,*)'condition number of the Gram matrix = ',condestgramb 829 if (cond_try==1.and.restart==0) then 830 ABI_DEALLOCATE(grama) 831 ABI_DEALLOCATE(gramb) 832 ABI_DEALLOCATE(eigen) 833 ! ABI_DEALLOCATE(coordx) 834 ABI_DEALLOCATE(coordx1) 835 ABI_DEALLOCATE(coordx2) 836 if(bigorder==i4) then 837 ABI_DEALLOCATE(coordx3) 838 end if 839 if (nrestart.gt.1) then 840 MSG_WARNING('the minimization is stopped for this block') 841 exit iter 842 else 843 restart=1 844 nrestart=nrestart+1 845 call wrtout(std_out,'Lobpcgwf: restart performed',"PERS") 846 end if 847 else 848 MSG_WARNING('Gramm matrix ill-conditionned: results may be unpredictable') 849 end if 850 else 851 exit cond 852 end if 853 end do cond 854 855 ! ########################################################################### 856 ! ################ END LOOP ON COND ######################################### 857 ! ########################################################################### 858 859 call abi_xhegv(1,'v','u',bigorder,grama,gramb,eigen,x_cplx=cplx,istwf_k=istwf_k, & 860 timopt=timopt,tim_xeigen=tim_xeigen,use_slk=dtset%use_slk,use_gpu=use_lapack_gpu) 861 862 deltae=-one 863 do iblocksize=1,blocksize 864 call abi_xcopy(1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,zvar,1,x_cplx=x_cplx) 865 deltae=max(deltae,abs(cmplx(zvar(1),zvar(2))-eigen(iblocksize))) 866 zvar=(/eigen(iblocksize),zero/) 867 call abi_xcopy(1,zvar,1,lambda(cplx*(iblocksize-1)+1,iblocksize),1,x_cplx=x_cplx) 868 end do 869 870 ! DEBUG 871 ! write(std_out,*)'eigen',eigen(1:blocksize) 872 ! ENDDEBUG 873 874 ! coordx(1:bigorder*cplx,1:blocksize)=grama(1:bigorder*cplx,1:blocksize) 875 coordx1(:,:) = grama(1+i1*cplx : i2*cplx,1:blocksize) 876 coordx2(:,:) = grama(1+i2*cplx : i3*cplx,1:blocksize) 877 if(bigorder==i4) then 878 coordx3(:,:) = grama(1+i3*cplx : i4*cplx,1:blocksize) 879 end if 880 881 882 if(use_linalg_gpu==1) then 883 call copy_on_gpu(coordx2,coordx2_gpu,cplx*dp*blocksize*blocksize) 884 if(bigorder==i4) then 885 call copy_on_gpu(coordx3,coordx3_gpu,cplx*dp*blocksize*blocksize) 886 end if 887 end if 888 889 ABI_DEALLOCATE(grama) 890 ABI_DEALLOCATE(gramb) 891 ABI_DEALLOCATE(eigen) 892 if (restart==0 .and. iterationnumber >1) then 893 894 ! blockvectorp=matmul(blockvectorr,coordx(i2+1:i3,:))+& 895 ! & matmul(blockvectorp,coordx(i3+1:i4,:)) 896 if(use_linalg_gpu==1) then 897 ! call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize) 898 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,& 899 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 900 call copy_on_gpu(blockvectorp,A_gpu,cplx*dp*vectsize*blocksize) 901 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,& 902 & coordx3_gpu,blocksize,cone,C_gpu,vectsize) 903 call copy_from_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize) 904 else 905 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,& 906 & vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 907 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorp,& 908 & vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx) 909 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorp,1,x_cplx=x_cplx) 910 end if 911 912 ! blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:))+& 913 ! & matmul(blockvectorap,coordx(i3+1:i4,:)) 914 if(use_linalg_gpu==1) then 915 ! call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize) 916 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,& 917 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 918 call copy_on_gpu(blockvectorap,A_gpu,cplx*dp*vectsize*blocksize) 919 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,& 920 & coordx3_gpu,blocksize,cone,C_gpu,vectsize) 921 call copy_from_gpu(blockvectorap,C_gpu,cplx*dp*vectsize*blocksize) 922 else 923 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,& 924 & vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 925 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorap,& 926 & vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx) 927 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorap,1,x_cplx=x_cplx) 928 end if 929 930 931 ! blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:))+& 932 ! & matmul(blockvectorvp,coordx(i3+1:i4,:)) 933 if (gs_hamk%usepaw==0) then 934 if(use_linalg_gpu==1) then 935 call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize) 936 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 937 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 938 call copy_on_gpu(blockvectorvp,A_gpu,cplx*dp*vectsize*blocksize) 939 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 940 & vectsize,coordx3_gpu,blocksize,cone,C_gpu,vectsize) 941 call copy_from_gpu(blockvectorvp,C_gpu,cplx*dp*vectsize*blocksize) 942 else 943 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,& 944 & vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 945 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvp,& 946 & vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx) 947 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorvp,1,x_cplx=x_cplx) 948 end if 949 end if 950 951 ! blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:))+& 952 ! & matmul(blockvectorbp,coordx(i3+1:i4,:)) 953 if(use_linalg_gpu==1) then 954 ! call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize) 955 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,& 956 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 957 call copy_on_gpu(blockvectorbp,A_gpu,cplx*dp*vectsize*blocksize) 958 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,vectsize,& 959 & coordx3_gpu,blocksize,cone,C_gpu,vectsize) 960 call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize) 961 else 962 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,& 963 & vectsize,coordx2,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 964 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbp,& 965 & vectsize,coordx3,blocksize,cone,blockvectordumm,vectsize,x_cplx=x_cplx) 966 call abi_xcopy(vectsize*blocksize,blockvectordumm,1,blockvectorbp,1,x_cplx=x_cplx) 967 end if 968 969 else 970 971 ! blockvectoSz =matmul(blockvectorr,coordx(i2+1:i3,:)) 972 if(use_linalg_gpu==1) then 973 ! call copy_on_gpu(blockvectorr,blockvectorr_gpu,cplx*dp*vectsize*blocksize) 974 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorr_gpu,& 975 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 976 call copy_from_gpu(blockvectorp,C_gpu,cplx*dp*vectsize*blocksize) 977 else 978 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorr,& 979 & vectsize,coordx2,blocksize,czero,blockvectorp,vectsize,x_cplx=x_cplx) 980 end if 981 982 ! blockvectorap=matmul(blockvectorar,coordx(i2+1:i3,:)) 983 if(use_linalg_gpu==1) then 984 ! call copy_on_gpu(blockvectorar,blockvectorar_gpu,cplx*dp*vectsize*blocksize) 985 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorar_gpu,& 986 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 987 call copy_from_gpu(blockvectorap,C_gpu,cplx*dp*vectsize*blocksize) 988 else 989 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorar,& 990 & vectsize,coordx2,blocksize,czero,blockvectorap,vectsize,x_cplx=x_cplx) 991 end if 992 ! blockvectorvp=matmul(blockvectorvr,coordx(i2+1:i3,:)) 993 if (gs_hamk%usepaw==0) then 994 if(use_linalg_gpu==1) then 995 call copy_on_gpu(blockvectorvr,A_gpu,cplx*dp*vectsize*blocksize) 996 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 997 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 998 call copy_from_gpu(blockvectorvp,C_gpu,cplx*dp*vectsize*blocksize) 999 else 1000 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvr,& 1001 & vectsize,coordx2,blocksize,czero,blockvectorvp,vectsize,x_cplx=x_cplx) 1002 end if 1003 end if 1004 1005 ! blockvectorbp=matmul(blockvectorbr,coordx(i2+1:i3,:)) 1006 if(use_linalg_gpu==1) then 1007 ! call copy_on_gpu(blockvectorbr,blockvectorbr_gpu,cplx*dp*vectsize*blocksize) 1008 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,blockvectorbr_gpu,& 1009 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 1010 call copy_from_gpu(blockvectorbp,C_gpu,cplx*dp*vectsize*blocksize) 1011 else 1012 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbr,& 1013 & vectsize,coordx2,blocksize,czero,blockvectorbp,vectsize,x_cplx=x_cplx) 1014 end if 1015 end if 1016 1017 if(use_linalg_gpu==1) then 1018 call copy_on_gpu(coordx1,coordx2_gpu,cplx*dp*blocksize*blocksize) 1019 end if 1020 1021 ! blockvectorx = matmul(blockvectorx,coordx(i1+1:i2,:))+blockvectorp 1022 if(use_linalg_gpu==1) then 1023 call copy_on_gpu(blockvectorx,A_gpu,cplx*dp*vectsize*blocksize) 1024 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 1025 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 1026 call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize) 1027 else 1028 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorx,& 1029 & vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 1030 end if 1031 blockvectorx = blockvectordumm+blockvectorp 1032 1033 ! blockvectorax= matmul(blockvectorax,coordx(i1+1:i2,:))+blockvectorap 1034 if(use_linalg_gpu==1) then 1035 call copy_on_gpu(blockvectorax,A_gpu,cplx*dp*vectsize*blocksize) 1036 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 1037 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 1038 call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize) 1039 else 1040 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorax,& 1041 & vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 1042 end if 1043 blockvectorax = blockvectordumm+blockvectorap 1044 1045 ! blockvectorvx= matmul(blockvectorvx,coordx(i1+1:i2,:))+blockvectorvp 1046 if (gs_hamk%usepaw==0) then 1047 if(use_linalg_gpu==1) then 1048 call copy_on_gpu(blockvectorvx,A_gpu,cplx*dp*vectsize*blocksize) 1049 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 1050 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 1051 call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize) 1052 else 1053 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorvx,& 1054 & vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 1055 end if 1056 blockvectorvx = blockvectordumm+blockvectorvp 1057 end if 1058 1059 ! blockvectorbx= matmul(blockvectorbx,coordx(i1+1:i2,:))+blockvectorbp 1060 if(use_linalg_gpu==1) then 1061 call copy_on_gpu(blockvectorbx,A_gpu,cplx*dp*vectsize*blocksize) 1062 call gpu_xgemm(cplx,'n','n',vectsize,blocksize,blocksize,cone,A_gpu,& 1063 & vectsize,coordx2_gpu,blocksize,czero,C_gpu,vectsize) 1064 call copy_from_gpu(blockvectordumm,C_gpu,cplx*dp*vectsize*blocksize) 1065 else 1066 call abi_xgemm('n','n',vectsize,blocksize,blocksize,cone,blockvectorbx,& 1067 & vectsize,coordx1,blocksize,czero,blockvectordumm,vectsize,x_cplx=x_cplx) 1068 end if 1069 blockvectorbx = blockvectordumm+blockvectorbp 1070 1071 ! ABI_DEALLOCATE(coordx) 1072 ABI_DEALLOCATE(coordx1) 1073 ABI_DEALLOCATE(coordx2) 1074 if(bigorder==i4) then 1075 ABI_DEALLOCATE(coordx3) 1076 end if 1077 1078 ! Check convergence on energy and eventually exit 1079 if (iterationnumber==1) then 1080 deold=deltae 1081 else if (iterationnumber>1) then 1082 if ((abs(deltae)<0.005*abs(deold)).and.(iterationnumber/=maxiterations))then 1083 if(prtvol>=10)then 1084 write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) & 1085 & ' lobpcgwf: block',iblock,', line',iterationnumber,& 1086 & ', deltae=',deltae,' < 0.005*',deold,' =>skip lines !' 1087 call wrtout(std_out,message,'PERS') 1088 end if 1089 exit 1090 else if (abs(deltae)>0.005*abs(deold)) then 1091 if(prtvol>=10)then 1092 write(message, '(2(a,i4),1x,a,1p,e12.4,a,e12.4,a)' ) & 1093 & ' lobpcgwf: block',iblock,', line',iterationnumber,& 1094 & ', deltae=',deltae,' > 0.005*',deold,' =>keep on working !' 1095 call wrtout(std_out,message,'PERS') 1096 end if 1097 end if 1098 end if 1099 1100 if(abs(dtset%timopt)==4) then 1101 call timab(524,2,tsec) 1102 end if 1103 1104 end do iter 1105 1106 ! ########################################################################### 1107 ! ################## END LOOP ON NLINE ###################################### 1108 ! ########################################################################### 1109 1110 if(abs(dtset%timopt)==4) then 1111 call timab(525,1,tsec) 1112 end if 1113 1114 if (havetoprecon) then 1115 call xprecon(blockvectorbx,lambda,blocksize,& 1116 & iterationnumber,kinpw,mpi_enreg,npw_k,my_nspinor,& 1117 & optekin,optpcon,pcon,blockvectorax,blockvectorr,vectsize,timopt=timopt,tim_xprecon=tim_xprecon) 1118 1119 residualnorms=sum(abs(blockvectorr)**2,dim=1) 1120 1121 if(abs(dtset%timopt)==3) then 1122 call timab(533,1,tsec) 1123 end if 1124 call xmpi_sum(residualnorms,mpi_enreg%comm_bandspinorfft,ierr) 1125 if(abs(dtset%timopt)==3) then 1126 call timab(533,2,tsec) 1127 end if 1128 1129 resid_k(bblocksize+1:bblocksize+blocksize)=residualnorms(1:blocksize) 1130 end if 1131 1132 call wfcopy('I',vectsize*blocksize,blockvectorx,1,cg,1,blocksize,iblock,'C',withbbloc=.true.,& 1133 & timopt=timopt,tim_wfcopy=tim_wfcopy) 1134 1135 if(gen_eigenpb) then 1136 call wfcopy('I',vectsize*blocksize,blockvectorbx,1,gsc,1,blocksize,iblock,'S',withbbloc=.true.,& 1137 & timopt=timopt,tim_wfcopy=tim_wfcopy) 1138 end if 1139 1140 ! The Vnl part of the Hamiltonian is no more stored in the packed form such as it was the case for subvnl(:). 1141 ! Now, the full matrix is stored in totvnl(:,:). This trick permits: 1142 ! 1) to avoid the reconstruction of the total matrix in vtowfk.F90 (double loop over bands) 1143 ! 2) to use two optimized matrix-matrix blas routine for general (in lobpcgccwf.F90) or hermitian (in vtowfk.F90) 1144 ! operators, zgemm.f and zhemm.f respectively, rather than a triple loop in both cases. 1145 iwavef=iblock*blocksize 1146 isubh=1+2*bblocksize*(bblocksize+1)/2 1147 1148 ABI_ALLOCATE(blockvectorz,(cplx*vectsize,iwavef)) 1149 if(bblocksize > 0 ) then 1150 call abi_xcopy(bblocksize*vectsize,blockvectory(:,1:bblocksize),1,blockvectorz(:,1:bblocksize),1,x_cplx=x_cplx) 1151 end if 1152 call abi_xcopy( blocksize*vectsize,blockvectorx(:,1:blocksize) ,1,blockvectorz(:,bblocksize+1:iwavef),1,x_cplx=x_cplx) 1153 1154 ABI_ALLOCATE(tsubham,(cplx*iwavef,blocksize)) 1155 tsubham(:,:)=zero 1156 call abi_xgemm(cparam(cplx),'n',iwavef,blocksize,vectsize,cone,blockvectorz,vectsize,& 1157 & blockvectorax,vectsize,czero,tsubham,iwavef,x_cplx=x_cplx) 1158 1159 if (gs_hamk%usepaw==0) then 1160 ! MG FIXME: Here gfortran4.9 allocates temporary array for C in abi_d2zgemm. 1161 call abi_xgemm(cparam(cplx),'n',blocksize,iwavef,vectsize,cone,blockvectorvx,vectsize,& 1162 & blockvectorz,vectsize,czero,totvnl(cplx*bblocksize+1:cplx*iwavef,1:iwavef),blocksize,x_cplx=x_cplx) 1163 end if 1164 1165 do iblocksize=1,blocksize 1166 do ii=1,bblocksize+iblocksize 1167 if ( cplx == 1 ) then 1168 subham(isubh) = tsubham(ii,iblocksize) 1169 subham(isubh+1)= zero 1170 else 1171 subham(isubh) = tsubham(2*ii-1,iblocksize) 1172 subham(isubh+1)= tsubham(2*ii ,iblocksize) 1173 end if 1174 isubh=isubh+2 1175 end do 1176 end do 1177 ABI_DEALLOCATE(tsubham) 1178 ABI_DEALLOCATE(blockvectorz) 1179 ! comm for subham and subvnl are made in vtowfk 1180 1181 ABI_DEALLOCATE(pcon) 1182 ABI_DEALLOCATE(blockvectory) 1183 ABI_DEALLOCATE(blockvectorby) 1184 ABI_DEALLOCATE(gramyx) 1185 ABI_DEALLOCATE(blockvectorx) 1186 ABI_DEALLOCATE(blockvectorax) 1187 ABI_DEALLOCATE(blockvectorbx) 1188 ABI_DEALLOCATE(blockvectorr) 1189 ABI_DEALLOCATE(blockvectorar) 1190 ABI_DEALLOCATE(blockvectorbr) 1191 ABI_DEALLOCATE(blockvectorp) 1192 ABI_DEALLOCATE(blockvectorap) 1193 ABI_DEALLOCATE(blockvectorbp) 1194 if (gs_hamk%usepaw==0) then 1195 ABI_DEALLOCATE(blockvectorvx) 1196 ABI_DEALLOCATE(blockvectorvp) 1197 ABI_DEALLOCATE(blockvectorvr) 1198 end if 1199 ABI_DEALLOCATE(blockvectordumm) 1200 ABI_DEALLOCATE(gramxax) 1201 ABI_DEALLOCATE(gramxar) 1202 ABI_DEALLOCATE(gramxap) 1203 ABI_DEALLOCATE(gramrar) 1204 ABI_DEALLOCATE(gramrap) 1205 ABI_DEALLOCATE(grampap) 1206 ABI_DEALLOCATE(gramxbx) 1207 ABI_DEALLOCATE(gramxbr) 1208 ABI_DEALLOCATE(gramxbp) 1209 ABI_DEALLOCATE(gramrbr) 1210 ABI_DEALLOCATE(gramrbp) 1211 ABI_DEALLOCATE(grampbp) 1212 ABI_DEALLOCATE(transf3) 1213 ABI_DEALLOCATE(transf5) 1214 ABI_DEALLOCATE(lambda) 1215 ABI_DEALLOCATE(residualnorms) 1216 if(use_linalg_gpu==1) then 1217 call dealloc_on_gpu(bblockvector_gpu) 1218 call dealloc_on_gpu(gram_gpu) 1219 end if 1220 1221 end do ! End big loop over bands inside blocks 1222 1223 if(use_linalg_gpu==1) then 1224 call dealloc_on_gpu(blockvectorr_gpu) 1225 call dealloc_on_gpu(blockvectorar_gpu) 1226 call dealloc_on_gpu(blockvectorbr_gpu) 1227 call dealloc_on_gpu(A_gpu) 1228 call dealloc_on_gpu(C_gpu) 1229 call dealloc_on_gpu(coordx2_gpu) 1230 call dealloc_on_gpu(coordx3_gpu) 1231 !call gpu_linalg_shutdown() 1232 end if 1233 1234 if(abs(dtset%timopt)==4) then 1235 call timab(525,2,tsec) 1236 end if 1237 call timab(530,2,tsec) 1238 1239 DBG_ENTER("COLL") 1240 1241 contains 1242 1243 function gramindex(iblocksize) 1244 1245 1246 !This section has been created automatically by the script Abilint (TD). 1247 !Do not modify the following lines by hand. 1248 #undef ABI_FUNC 1249 #define ABI_FUNC 'gramindex' 1250 !End of the abilint section 1251 1252 integer :: gramindex,iblocksize 1253 gramindex=(iblocksize-1)*cplx+1 1254 end function gramindex 1255 1256 end subroutine lobpcgwf