TABLE OF CONTENTS
ABINIT/cheb_oracle [ Functions ]
NAME
cheb_oracle
FUNCTION
Returns the number of necessary iterations to decrease residual by at least tol Here as in the rest of the code, the convention is that residuals are squared (||Ax-lx||^2)
COPYRIGHT
Copyright (C) 2014-2018 ABINIT group (AL) this file is distributed under the terms of the gnu general public license, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
INPUTS
x= input variable a= left bound of the interval b= right bound of the interval tol= needed precision nmax= max number of iterations
OUTPUT
n= number of iterations needed to decrease residual by tol
SIDE EFFECTS
PARENTS
chebfi
CHILDREN
NOTES
SOURCE
686 function cheb_oracle(x, a, b, tol, nmax) result(n) 687 use defs_basis 688 689 !This section has been created automatically by the script Abilint (TD). 690 !Do not modify the following lines by hand. 691 #undef ABI_FUNC 692 #define ABI_FUNC 'cheb_oracle' 693 !End of the abilint section 694 695 implicit none 696 697 real(dp) :: tol 698 699 integer :: nmax 700 integer :: n, i 701 real(dp), intent(in) :: x, a, b 702 real(dp) :: y, xred, temp 703 real(dp) :: yim1 704 705 ! ************************************************************************* 706 707 xred = (x-(a+b)/2)/(b-a)*2 708 y = xred 709 yim1 = one 710 711 n = nmax 712 if(1/(y**2) < tol) then 713 n = 1 714 else 715 do i=2, nmax-1 716 temp = y 717 y = 2*xred*y - yim1 718 yim1 = temp 719 if(1/(y**2) < tol) then 720 n = i 721 exit 722 end if 723 end do 724 end if 725 726 end function cheb_oracle
ABINIT/cheb_poly [ Functions ]
NAME
cheb_poly
FUNCTION
Computes the value of the Chebyshev polynomial of degree n on the interval [a,b] at x
COPYRIGHT
Copyright (C) 2014-2018 ABINIT group (AL) this file is distributed under the terms of the gnu general public license, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
INPUTS
x= input variable n= degree a= left bound of the interval b= right bound of the interval
OUTPUT
y= Tn(x)
SIDE EFFECTS
PARENTS
chebfi
CHILDREN
NOTES
SOURCE
618 function cheb_poly(x, n, a, b) result(y) 619 use defs_basis 620 621 !This section has been created automatically by the script Abilint (TD). 622 !Do not modify the following lines by hand. 623 #undef ABI_FUNC 624 #define ABI_FUNC 'cheb_poly' 625 !End of the abilint section 626 627 implicit none 628 629 integer, intent(in) :: n 630 integer :: i 631 real(dp), intent(in) :: x, a, b 632 real(dp) :: y, xred, temp 633 real(dp) :: yim1 634 635 ! ************************************************************************* 636 637 xred = (x-(a+b)/2)/(b-a)*2 638 y = xred 639 yim1 = one 640 do i=2, n 641 temp = y 642 y = 2*xred*y - yim1 643 yim1 = temp 644 end do 645 646 end function cheb_poly
ABINIT/chebfi [ Functions ]
NAME
chebfi
FUNCTION
this routine updates the wave functions at a given k-point, using the ChebFi method (see paper by A. Levitt and M. Torrent)
COPYRIGHT
Copyright (C) 2014-2018 ABINIT group (AL) this file is distributed under the terms of the gnu general public license, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
INPUTS
dtset <type(dataset_type)>=all input variales for this dataset gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k kinpw(npw)=(modified) kinetic energy for each plane wave (hartree) mpi_enreg=information about MPI parallelization nband=number of bands at this k point for that spin polarization npw=number of plane waves at this k point nspinor=number of plane waves at this k point prtvol=control print volume and debugging output
OUTPUT
eig(nband)=array for holding eigenvalues (hartree) resid(nband)=residuals for each states If gs_hamk%usepaw==1: gsc(2,*)=<g|s|c> matrix elements (s=overlap) If gs_hamk%usepaw==0 enl(nband)=contribution from each band to nonlocal pseudopotential part of total energy, at this k-point
SIDE EFFECTS
cg(2,*)=updated wavefunctions
PARENTS
vtowfk
CHILDREN
apply_invovl,dotprod_g,getghc,pawcprj_alloc,pawcprj_axpby,pawcprj_copy pawcprj_free,prep_getghc,prep_index_wavef_bandpp rayleigh_ritz_distributed,rayleigh_ritz_subdiago,timab,wrtout xmpi_alltoallv,xmpi_barrier,xmpi_max,xmpi_min,xmpi_sum
NOTES
-- TODO -- Normev? Ecutsm nspinor 2 spinors parallelisation fock -- Performance -- Improve load balancing Don't diagonalize converged eigenvectors, just orthogonalize Maybe don't diagonalize so often (once every two outer iterations?) Benchmark diagonalizations, choose np_slk How to chose npfft? Implement MINRES for invovl -- LOBPCG -- Improve stability (see paper by Lehoucq Sorensen, maybe use bunch-kaufman factorizations?)
SOURCE
66 #if defined HAVE_CONFIG_H 67 #include "config.h" 68 #endif 69 70 #include "abi_common.h" 71 72 subroutine chebfi(cg,dtset,eig,enl,gs_hamk,gsc,kinpw,mpi_enreg,nband,npw,nspinor,prtvol,resid) 73 74 75 use defs_basis 76 use defs_abitypes 77 use m_errors 78 use m_xmpi 79 use m_profiling_abi 80 use m_cgtools, only : dotprod_g 81 use m_efield 82 use m_abi_linalg 83 use m_invovl 84 use m_bandfft_kpt, only : bandfft_kpt,bandfft_kpt_get_ikpt 85 #if defined HAVE_MPI2 86 use mpi 87 #endif 88 89 use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby, pawcprj_copy 90 use m_hamiltonian, only : gs_hamiltonian_type 91 92 !This section has been created automatically by the script Abilint (TD). 93 !Do not modify the following lines by hand. 94 #undef ABI_FUNC 95 #define ABI_FUNC 'chebfi' 96 use interfaces_14_hidewrite 97 use interfaces_18_timing 98 use interfaces_66_wfs, except_this_one => chebfi 99 !End of the abilint section 100 101 implicit none 102 103 #if defined HAVE_MPI1 104 include 'mpif.h' 105 #endif 106 107 !Arguments ------------------------------------ 108 type(gs_hamiltonian_type),intent(inout) :: gs_hamk 109 type(dataset_type),intent(in) :: dtset 110 type(mpi_type),intent(inout) :: mpi_enreg 111 integer,intent(in) :: nband,npw,prtvol,nspinor 112 real(dp),intent(inout), target :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband) 113 real(dp),intent(in) :: kinpw(npw) 114 real(dp),intent(out) :: resid(nband) 115 real(dp),intent(out) :: enl(nband*(1-gs_hamk%usepaw)) 116 real(dp),intent(out) :: eig(nband) 117 118 !Local variables------------------------------- 119 real(dp) :: pcon(npw) 120 real(dp) :: filter_low 121 real(dp) :: filter_center, filter_radius 122 real(dp), dimension(2, npw*nspinor*nband), target :: ghc, gvnlc 123 real(dp), allocatable, dimension(:,:) :: cg_filter_next, cg_filter_prev, gsm1hc_filter, gsc_filter_prev, gsc_filter_next 124 real(dp), allocatable, dimension(:,:), target :: cg_alltoall1,gsc_alltoall1,ghc_alltoall1,gvnlc_alltoall1 125 real(dp), allocatable, dimension(:,:), target :: cg_alltoall2,gsc_alltoall2,ghc_alltoall2,gvnlc_alltoall2 126 real(dp), pointer, dimension(:,:) :: cg_filter, gsc_filter, ghc_filter, gvnlc_filter 127 real(dp) :: resid_vec(2, npw*nspinor) 128 logical :: paw 129 integer :: shift, shift_cg_loadbalanced 130 integer :: iband, iline, ispinor 131 integer :: sij_opt, cpopt 132 real(dp) :: eval, tsec(2) 133 integer :: tim_getghc = 5, ierr 134 integer :: i 135 integer, allocatable :: index_wavef_band(:) 136 real(dp) :: maxeig, mineig 137 real(dp), allocatable :: resids_filter(:), residvec_filter(:,:) 138 integer, allocatable :: nline_bands(:) 139 integer :: iactive, nactive 140 real(dp) :: ampfactor 141 integer :: nline_max, nline_decrease, nline_tolwfr 142 ! real(dp) :: load_imbalance 143 integer :: mcg 144 real(dp) :: dprod_r, dprod_i 145 character(len=500) :: message 146 integer :: rdisplsloc(mpi_enreg%nproc_band),recvcountsloc(mpi_enreg%nproc_band) 147 integer :: sdisplsloc(mpi_enreg%nproc_band), sendcountsloc(mpi_enreg%nproc_band) 148 integer :: ikpt_this_proc, npw_filter, nband_filter 149 type(pawcprj_type), allocatable :: cwaveprj(:,:), cwaveprj_next(:,:), cwaveprj_prev(:,:) 150 ! integer :: nline_total 151 152 ! timers 153 integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603 154 integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607 155 integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610 156 157 ! ************************************************************************* 158 159 !====================================================================================================== 160 ! Initialize, transpose input cg if paral_kgb 161 !====================================================================================================== 162 call timab(timer_chebfi,1,tsec) 163 164 !Initializations 165 paw = gs_hamk%usepaw == 1 166 mcg = npw*nspinor*nband 167 168 ! Init pcon 169 pcon = (27+kinpw*(18+kinpw*(12+8*kinpw))) / (27+kinpw*(18+kinpw*(12+8*kinpw)) + 16*kinpw**4) 170 171 ghc=zero; gvnlc=zero 172 173 ! Initialize the _filter pointers. Depending on paral_kgb, they might point to the actual arrays or to _alltoall variables 174 if (dtset%paral_kgb == 1) then 175 ikpt_this_proc = bandfft_kpt_get_ikpt() 176 npw_filter = bandfft_kpt(ikpt_this_proc)%ndatarecv 177 nband_filter = mpi_enreg%bandpp 178 179 ABI_ALLOCATE(cg_alltoall1, (2, npw_filter*nspinor*nband_filter)) 180 ABI_ALLOCATE(gsc_alltoall1, (2, npw_filter*nspinor*nband_filter)) 181 ABI_ALLOCATE(ghc_alltoall1, (2, npw_filter*nspinor*nband_filter)) 182 ABI_ALLOCATE(gvnlc_alltoall1, (2, npw_filter*nspinor*nband_filter)) 183 ABI_ALLOCATE(cg_alltoall2, (2, npw_filter*nspinor*nband_filter)) 184 ABI_ALLOCATE(gsc_alltoall2, (2, npw_filter*nspinor*nband_filter)) 185 ABI_ALLOCATE(ghc_alltoall2, (2, npw_filter*nspinor*nband_filter)) 186 ABI_ALLOCATE(gvnlc_alltoall2, (2, npw_filter*nspinor*nband_filter)) 187 188 ! Init tranpose variables 189 recvcountsloc=bandfft_kpt(ikpt_this_proc)%recvcounts*2*nspinor*mpi_enreg%bandpp 190 rdisplsloc=bandfft_kpt(ikpt_this_proc)%rdispls*2*nspinor*mpi_enreg%bandpp 191 sendcountsloc=bandfft_kpt(ikpt_this_proc)%sendcounts*2*nspinor 192 sdisplsloc=bandfft_kpt(ikpt_this_proc)%sdispls*2*nspinor 193 194 ! Load balancing, so that each processor has approximately the same number of converged and non-converged bands 195 ! for two procs, rearrange 1 2 3 4 5 6 as 1 4 2 5 3 6 196 ! 197 ! trick to save memory: ghc has the necessary size, and will be overwritten afterwards anyway 198 #define cg_loadbalanced ghc 199 shift = 0 200 do i=1, mpi_enreg%nproc_band 201 do iband=1, mpi_enreg%bandpp 202 shift_cg_loadbalanced = (i-1 + (iband-1)*mpi_enreg%nproc_band)*npw*nspinor 203 cg_loadbalanced(:, shift+1:shift+npw*nspinor) = cg(:, shift_cg_loadbalanced+1:shift_cg_loadbalanced+npw*nspinor) 204 shift = shift + npw*nspinor 205 end do 206 end do 207 208 ! Transpose input cg into cg_alloall1. cg_alltoall1 is now (npw_filter, nband_filter) 209 call timab(timer_alltoall, 1, tsec) 210 call xmpi_alltoallv(cg_loadbalanced,sendcountsloc,sdisplsloc,cg_alltoall1,& 211 & recvcountsloc,rdisplsloc,mpi_enreg%comm_band,ierr) 212 call timab(timer_alltoall, 2, tsec) 213 #undef cg_loadbalanced 214 215 ! sort according to bandpp (from lobpcg, I don't fully understand what's going on but it works and it's fast) 216 call prep_index_wavef_bandpp(mpi_enreg%nproc_band,mpi_enreg%bandpp,& 217 & nspinor,bandfft_kpt(ikpt_this_proc)%ndatarecv,& 218 & bandfft_kpt(ikpt_this_proc)%recvcounts,bandfft_kpt(ikpt_this_proc)%rdispls,& 219 & index_wavef_band) 220 cg_alltoall2(:,:) = cg_alltoall1(:,index_wavef_band) 221 222 cg_filter => cg_alltoall2 223 gsc_filter => gsc_alltoall2 224 ghc_filter => ghc_alltoall2 225 gvnlc_filter => gvnlc_alltoall2 226 else 227 npw_filter = npw 228 nband_filter = nband 229 230 cg_filter => cg 231 gsc_filter => gsc 232 ghc_filter => ghc 233 gvnlc_filter => gvnlc 234 end if 235 ! from here to the next alltoall, all computation is done on _filter variables, agnostic 236 ! to whether it's nband x npw (paral_kgb == 0) or ndatarecv*bandpp (paral_kgb = 1) 237 238 ! Allocate filter variables for the application of the Chebyshev polynomial 239 ABI_ALLOCATE(cg_filter_next, (2, npw_filter*nspinor*nband_filter)) 240 ABI_ALLOCATE(cg_filter_prev, (2, npw_filter*nspinor*nband_filter)) 241 ABI_ALLOCATE(gsc_filter_prev, (2, npw_filter*nspinor*nband_filter)) 242 ABI_ALLOCATE(gsc_filter_next, (2, npw_filter*nspinor*nband_filter)) 243 ABI_ALLOCATE(gsm1hc_filter, (2, npw_filter*nspinor*nband_filter)) 244 245 ! PAW init 246 if(paw) then 247 ABI_DATATYPE_ALLOCATE(cwaveprj, (gs_hamk%natom,nspinor*nband_filter)) 248 ABI_DATATYPE_ALLOCATE(cwaveprj_next, (gs_hamk%natom,nspinor*nband_filter)) 249 ABI_DATATYPE_ALLOCATE(cwaveprj_prev, (gs_hamk%natom,nspinor*nband_filter)) 250 call pawcprj_alloc(cwaveprj,0,gs_hamk%dimcprj) 251 call pawcprj_alloc(cwaveprj_next,0,gs_hamk%dimcprj) 252 call pawcprj_alloc(cwaveprj_prev,0,gs_hamk%dimcprj) 253 254 sij_opt = 1 ! recompute S 255 cpopt = 0 ! save cprojs 256 else 257 sij_opt = 0 258 cpopt = -1 259 end if 260 261 262 263 !====================================================================================================== 264 ! Data in npfft x npband distribution. First getghc, update eigenvalues and residuals 265 !====================================================================================================== 266 write(message, *) 'First getghc' 267 call wrtout(std_out,message,'COLL') 268 269 ! get_ghc on cg 270 call timab(timer_getghc, 1, tsec) 271 if (dtset%paral_kgb == 0) then 272 call getghc(cpopt,cg_filter,cwaveprj,ghc_filter,gsc_filter,gs_hamk,gvnlc_filter,& 273 & eval,mpi_enreg,nband,prtvol,sij_opt,tim_getghc,0) 274 else 275 call prep_getghc(cg_filter,gs_hamk,gvnlc_filter,ghc_filter,gsc_filter,eval,nband,mpi_enreg,& 276 & prtvol,sij_opt,cpopt,cwaveprj,already_transposed=.true.) 277 end if 278 call timab(timer_getghc, 2, tsec) 279 280 ! Debug barrier: should be invisible 281 call timab(timer_sync, 1, tsec) 282 call xmpi_barrier(mpi_enreg%comm_band) 283 call timab(timer_sync, 2, tsec) 284 285 write(message, *) 'Computing residuals' 286 call wrtout(std_out,message,'COLL') 287 ! update eigenvalues and residuals 288 call timab(timer_update_eigen, 1, tsec) 289 ABI_ALLOCATE(resids_filter, (nband_filter)) 290 ABI_ALLOCATE(residvec_filter, (2, npw_filter*nspinor)) 291 ABI_ALLOCATE(nline_bands, (nband_filter)) 292 do iband=1, nband_filter 293 shift = npw_filter*nspinor*(iband-1) 294 call dotprod_g(eig(iband),dprod_i,gs_hamk%istwf_k,npw_filter*nspinor,1,ghc_filter(:, shift+1:shift+npw_filter*nspinor),& 295 & cg_filter(:, shift+1:shift+npw_filter*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_spinorfft) 296 if(paw) then 297 call dotprod_g(dprod_r,dprod_i,gs_hamk%istwf_k,npw_filter*nspinor,1,gsc_filter(:, shift+1:shift+npw_filter*nspinor),& 298 & cg_filter(:, shift+1:shift+npw_filter*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_spinorfft) 299 eig(iband) = eig(iband)/dprod_r 300 end if 301 302 if(paw) then 303 residvec_filter = ghc_filter(:, shift+1 : shift+npw_filter*nspinor) & 304 & - eig(iband)*gsc_filter(:, shift+1 : shift+npw_filter*nspinor) 305 else 306 residvec_filter = ghc_filter(:, shift+1 : shift+npw_filter*nspinor) & 307 & - eig(iband)*cg_filter(:, shift+1 : shift+npw_filter*nspinor) 308 end if 309 resids_filter(iband) = SUM(residvec_filter**2) 310 end do 311 call xmpi_sum(resids_filter,mpi_enreg%comm_fft,ierr) 312 call xmpi_max(MAXVAL(eig(1:nband_filter)),maxeig,mpi_enreg%comm_band,ierr) 313 call xmpi_min(MINVAL(eig(1:nband_filter)),mineig,mpi_enreg%comm_band,ierr) 314 filter_low = maxeig 315 call timab(timer_update_eigen, 2, tsec) 316 317 ! Decide how many iterations per band are needed 318 ! don't go above this, or face bad conditioning of the Gram matrix. 319 nline_max = cheb_oracle(mineig, filter_low, dtset%ecut, 1e-16_dp, 40) 320 ! if(mpi_enreg%me == 0) write(0, *) nline_max 321 do iband=1, nband_filter 322 ! nline necessary to converge to tolwfr 323 nline_tolwfr = cheb_oracle(eig(iband), filter_low, dtset%ecut, dtset%tolwfr / resids_filter(iband), dtset%nline) 324 ! nline necessary to decrease residual by a constant factor 325 nline_decrease = cheb_oracle(eig(iband), filter_low, dtset%ecut, 0.1_dp, dtset%nline) 326 327 nline_bands(iband) = MAX(MIN(nline_tolwfr, nline_decrease, nline_max, dtset%nline), 1) 328 nline_bands(iband) = dtset%nline ! fiddle with this to use locking 329 end do 330 331 332 !!!!! Uncomment for diagnostics 333 ! nline_total = SUM(nline_bands) 334 ! call xmpi_sum(nline_total, mpi_enreg%comm_band, ierr) 335 ! load_imbalance = (SUM(nline_bands) - REAL(nline_total)/REAL(mpi_enreg%nproc_band)) / & 336 ! & (REAL(nline_total)/REAL(mpi_enreg%nproc_band)) 337 ! call xmax_mpi(load_imbalance, mpi_enreg%comm_band, ierr) 338 339 ! write(message, *) 'Mean nline', REAL(nline_total)/REAL(nband), 'max imbalance (%)', load_imbalance*100 340 ! call wrtout(std_out,message,'COLL') 341 342 ABI_DEALLOCATE(resids_filter) 343 ABI_DEALLOCATE(residvec_filter) 344 345 !====================================================================================================== 346 ! Chebyshev polynomial application 347 !====================================================================================================== 348 ! Filter by a chebyshev polynomial of degree nline 349 do iline=1,dtset%nline 350 ! Filter only on [iactive, iactive+nactive-1] 351 iactive = nband_filter 352 do iband = 1, nband_filter 353 ! does iband need an iteration? 354 if (nline_bands(iband) >= iline) then 355 iactive = iband 356 exit 357 end if 358 end do 359 nactive = nband_filter - iactive + 1 360 shift = npw_filter*nspinor*(iactive-1) + 1 361 ! trick the legacy prep_getghc 362 mpi_enreg%bandpp = nactive 363 364 ! Define the filter position 365 filter_center = (dtset%ecut+filter_low)/2 366 filter_radius = (dtset%ecut-filter_low)/2 367 368 ! write(message, *) 'Applying invovl, iteration', iline 369 ! call wrtout(std_out,message,'COLL') 370 371 ! If paw, have to apply S^-1 372 if(paw) then 373 call timab(timer_apply_inv_ovl, 1, tsec) 374 call apply_invovl(gs_hamk, ghc_filter(:,shift:), gsm1hc_filter(:,shift:), cwaveprj_next(:,iactive:), & 375 & npw_filter, nactive, mpi_enreg, nspinor) 376 call timab(timer_apply_inv_ovl, 2, tsec) 377 else 378 gsm1hc_filter(:,shift:) = ghc_filter(:,shift:) 379 end if 380 381 ! Chebyshev iteration: UPDATE cg 382 if(iline == 1) then 383 cg_filter_next(:,shift:) = one/filter_radius * (gsm1hc_filter(:,shift:) - filter_center*cg_filter(:,shift:)) 384 else 385 cg_filter_next(:,shift:) = two/filter_radius * (gsm1hc_filter(:,shift:) - filter_center*cg_filter(:,shift:)) & 386 & - cg_filter_prev(:,shift:) 387 end if 388 ! Update gsc and cwaveprj 389 if(paw) then 390 if(iline == 1) then 391 gsc_filter_next(:,shift:) = one/filter_radius * (ghc_filter(:,shift:) - filter_center*gsc_filter(:,shift:)) 392 !cwaveprj_next = one/filter_radius * (cwaveprj_next - filter_center*cwaveprj) 393 call pawcprj_axpby(-filter_center/filter_radius, one/filter_radius,cwaveprj(:,iactive:),cwaveprj_next(:,iactive:)) 394 else 395 gsc_filter_next(:,shift:) = two/filter_radius * (ghc_filter(:,shift:) - filter_center*gsc_filter(:,shift:))& 396 & - gsc_filter_prev(:,shift:) 397 !cwaveprj_next = two/filter_radius * (cwaveprj_next - filter_center*cwaveprj) - cwaveprj_prev 398 call pawcprj_axpby(-two*filter_center/filter_radius, two/filter_radius,cwaveprj(:,iactive:),cwaveprj_next(:,iactive:)) 399 call pawcprj_axpby(-one, one,cwaveprj_prev(:,iactive:),cwaveprj_next(:,iactive:)) 400 end if 401 end if 402 403 ! Bookkeeping of the _prev variables 404 cg_filter_prev(:,shift:) = cg_filter(:,shift:) 405 cg_filter(:,shift:) = cg_filter_next(:,shift:) 406 if(paw) then 407 gsc_filter_prev(:,shift:) = gsc_filter(:,shift:) 408 gsc_filter(:,shift:) = gsc_filter_next(:,shift:) 409 410 call pawcprj_copy(cwaveprj(:,iactive:),cwaveprj_prev(:,iactive:)) 411 call pawcprj_copy(cwaveprj_next(:,iactive:),cwaveprj(:,iactive:)) 412 end if 413 414 ! Update ghc 415 if(paw) then 416 !! DEBUG use this to remove the optimization and recompute gsc/cprojs 417 ! sij_opt = 1 418 ! cpopt = 0 419 420 sij_opt = 0 ! gsc is already computed 421 cpopt = 2 ! reuse cprojs 422 else 423 sij_opt = 0 424 cpopt = -1 425 end if 426 427 write(message, *) 'Getghc, iteration', iline 428 call wrtout(std_out,message,'COLL') 429 430 call timab(timer_getghc, 1, tsec) 431 if (dtset%paral_kgb == 0) then 432 call getghc(cpopt,cg_filter(:,shift:),cwaveprj(:,iactive:),ghc_filter(:,shift:),& 433 & gsc_filter(:,shift:),gs_hamk,gvnlc_filter(:,shift:),eval,mpi_enreg,& 434 & nband,prtvol,sij_opt,tim_getghc,0) 435 else 436 call prep_getghc(cg_filter(:,shift:),gs_hamk,gvnlc_filter(:,shift:),ghc_filter(:,shift:),& 437 & gsc_filter(:,shift:),eval,nband,mpi_enreg,prtvol,sij_opt,cpopt,& 438 & cwaveprj(:,iactive:),already_transposed=.true.) 439 end if 440 441 ! end of the trick 442 mpi_enreg%bandpp = nband_filter 443 444 call timab(timer_getghc, 2, tsec) 445 end do ! end loop on nline 446 447 ! normalize according to the previously computed rayleigh quotients (inaccurate, but cheap) 448 do iband = 1, nband_filter 449 ampfactor = cheb_poly(eig(iband), nline_bands(iband), filter_low, dtset%ecut) 450 if(abs(ampfactor) < 1e-3) ampfactor = 1e-3 ! just in case, avoid amplifying too much 451 shift = npw_filter*nspinor*(iband-1) 452 cg_filter(:, shift+1:shift+npw_filter*nspinor) = cg_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor 453 ghc_filter(:, shift+1:shift+npw_filter*nspinor) = ghc_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor 454 if(paw) then 455 gsc_filter(:, shift+1:shift+npw_filter*nspinor) = gsc_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor 456 else 457 gvnlc_filter(:, shift+1:shift+npw_filter*nspinor) = gvnlc_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor 458 end if 459 end do 460 461 ! Cleanup 462 if(paw) then 463 call pawcprj_free(cwaveprj) 464 call pawcprj_free(cwaveprj_next) 465 call pawcprj_free(cwaveprj_prev) 466 ABI_DATATYPE_DEALLOCATE(cwaveprj) 467 ABI_DATATYPE_DEALLOCATE(cwaveprj_next) 468 ABI_DATATYPE_DEALLOCATE(cwaveprj_prev) 469 end if 470 ABI_DEALLOCATE(nline_bands) 471 ABI_DEALLOCATE(cg_filter_next) 472 ABI_DEALLOCATE(cg_filter_prev) 473 ABI_DEALLOCATE(gsc_filter_prev) 474 ABI_DEALLOCATE(gsc_filter_next) 475 ABI_DEALLOCATE(gsm1hc_filter) 476 477 !====================================================================================================== 478 ! Filtering done, tranpose back 479 !====================================================================================================== 480 481 write(message, *) 'Filtering done, transposing back' 482 call wrtout(std_out,message,'COLL') 483 484 ! transpose back 485 if(dtset%paral_kgb == 1) then 486 cg_alltoall1(:,index_wavef_band) = cg_alltoall2(:,:) 487 ghc_alltoall1(:,index_wavef_band) = ghc_alltoall2(:,:) 488 if(paw) then 489 gsc_alltoall1(:,index_wavef_band) = gsc_alltoall2(:,:) 490 else 491 gvnlc_alltoall1(:,index_wavef_band) = gvnlc_alltoall2(:,:) 492 end if 493 494 ABI_DEALLOCATE(index_wavef_band) 495 496 call timab(timer_sync, 1, tsec) 497 call xmpi_barrier(mpi_enreg%comm_band) 498 call timab(timer_sync, 2, tsec) 499 500 call timab(timer_alltoall, 1, tsec) 501 502 ! Do we pack the arrays in the alltoall, saving latency, or do we do it separately, saving memory and copies? 503 call xmpi_alltoallv(cg_alltoall1,recvcountsloc,rdisplsloc,cg,& 504 & sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr) 505 call xmpi_alltoallv(ghc_alltoall1,recvcountsloc,rdisplsloc,ghc,& 506 & sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr) 507 if(paw) then 508 call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,& 509 & sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr) 510 else 511 call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,& 512 & sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr) 513 end if 514 call timab(timer_alltoall, 2, tsec) 515 516 if(mpi_enreg%paral_kgb == 1) then 517 ABI_DEALLOCATE(cg_alltoall1) 518 ABI_DEALLOCATE(gsc_alltoall1) 519 ABI_DEALLOCATE(ghc_alltoall1) 520 ABI_DEALLOCATE(gvnlc_alltoall1) 521 ABI_DEALLOCATE(cg_alltoall2) 522 ABI_DEALLOCATE(gsc_alltoall2) 523 ABI_DEALLOCATE(ghc_alltoall2) 524 ABI_DEALLOCATE(gvnlc_alltoall2) 525 end if 526 else 527 ! nothing to do, the _filter variables already point to the right ones 528 end if 529 530 531 532 !====================================================================================================== 533 ! Data in (npfft*npband) x 1 distribution. Rayleigh-Ritz step 534 !====================================================================================================== 535 536 ! _subdiago might use less memory when using only one proc, should maybe call it, or just remove it 537 ! and always call _distributed 538 #if defined HAVE_LINALG_SCALAPACK 539 call rayleigh_ritz_distributed(cg,ghc,gsc,gvnlc,eig,gs_hamk%istwf_k,mpi_enreg,nband,npw,nspinor,gs_hamk%usepaw) 540 #else 541 call rayleigh_ritz_subdiago(cg,ghc,gsc,gvnlc,eig,gs_hamk%istwf_k,mpi_enreg,nband,npw,nspinor,gs_hamk%usepaw) 542 #endif 543 544 ! Build residuals 545 call timab(timer_residuals, 1, tsec) 546 do iband=1,nband 547 shift = npw*nspinor*(iband-1) 548 if(paw) then 549 resid_vec = ghc(:, shift+1 : shift+npw*nspinor) - eig(iband)*gsc(:, shift+1 : shift+npw*nspinor) 550 else 551 resid_vec = ghc(:, shift+1 : shift+npw*nspinor) - eig(iband)*cg (:, shift+1 : shift+npw*nspinor) 552 end if 553 554 ! precondition resid_vec 555 do ispinor = 1,nspinor 556 resid_vec(1, npw*(ispinor-1)+1:npw*ispinor) = resid_vec(1, npw*(ispinor-1)+1:npw*ispinor) * pcon 557 resid_vec(2, npw*(ispinor-1)+1:npw*ispinor) = resid_vec(2, npw*(ispinor-1)+1:npw*ispinor) * pcon 558 end do 559 560 call dotprod_g(resid(iband),dprod_i,gs_hamk%istwf_k,npw*nspinor,1,resid_vec,& 561 & resid_vec,mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft) 562 563 if(.not. paw) then 564 call dotprod_g(enl(iband),dprod_i,gs_hamk%istwf_k,npw*nspinor,1,cg(:, shift+1:shift+npw*nspinor),& 565 & gvnlc(:, shift+1:shift+npw_filter*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft) 566 end if 567 end do 568 call timab(timer_residuals, 2, tsec) 569 570 ! write(message, '(a,4e10.2)') 'Resids (1, N, min, max) ', resid(1), resid(nband), MINVAL(resid), MAXVAL(resid) 571 ! call wrtout(std_out,message,'COLL') 572 573 ! write(message,*)'Eigens(1,nocc,nband) ',eig(1), eig(ilastocc),eig(nband) 574 ! call wrtout(std_out,message,'COLL') 575 ! write(message,*)'Resids(1,nocc,nband) ',resid(1), resid(ilastocc),resid(nband) 576 ! call wrtout(std_out,message,'COLL') 577 578 call timab(timer_chebfi,2,tsec) 579 580 end subroutine chebfi