TABLE OF CONTENTS


ABINIT/cheb_oracle [ Functions ]

[ Top ] [ Functions ]

NAME

 cheb_oracle

FUNCTION

 Returns the number of necessary iterations to decrease residual by at least tol
 Here as in the rest of the code, the convention is that residuals are squared (||Ax-lx||^2)

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 this file is distributed under the terms of the
 gnu general public license, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 for the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

 x= input variable
 a= left bound of the interval
 b= right bound of the interval
 tol= needed precision
 nmax= max number of iterations

OUTPUT

 n= number of iterations needed to decrease residual by tol

SIDE EFFECTS

PARENTS

      chebfi

CHILDREN

NOTES

SOURCE

686 function cheb_oracle(x, a, b, tol, nmax) result(n)
687  use defs_basis
688 
689 !This section has been created automatically by the script Abilint (TD).
690 !Do not modify the following lines by hand.
691 #undef ABI_FUNC
692 #define ABI_FUNC 'cheb_oracle'
693 !End of the abilint section
694 
695  implicit none
696 
697  real(dp) :: tol
698 
699  integer :: nmax
700  integer :: n, i
701  real(dp), intent(in) :: x, a, b
702  real(dp) :: y, xred, temp
703  real(dp) :: yim1
704 
705 ! *************************************************************************
706 
707  xred = (x-(a+b)/2)/(b-a)*2
708  y = xred
709  yim1 = one
710 
711  n = nmax
712  if(1/(y**2) < tol) then
713    n = 1
714  else
715    do i=2, nmax-1
716      temp = y
717      y = 2*xred*y - yim1
718      yim1 = temp
719      if(1/(y**2) < tol) then
720        n = i
721        exit
722      end if
723    end do
724  end if
725 
726 end function cheb_oracle

ABINIT/cheb_poly [ Functions ]

[ Top ] [ Functions ]

NAME

 cheb_poly

FUNCTION

 Computes the value of the Chebyshev polynomial of degree n on the interval [a,b] at x

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 this file is distributed under the terms of the
 gnu general public license, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 for the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

 x= input variable
 n= degree
 a= left bound of the interval
 b= right bound of the interval

OUTPUT

 y= Tn(x)

SIDE EFFECTS

PARENTS

      chebfi

CHILDREN

NOTES

SOURCE

618 function cheb_poly(x, n, a, b) result(y)
619  use defs_basis
620 
621 !This section has been created automatically by the script Abilint (TD).
622 !Do not modify the following lines by hand.
623 #undef ABI_FUNC
624 #define ABI_FUNC 'cheb_poly'
625 !End of the abilint section
626 
627  implicit none
628 
629  integer, intent(in) :: n
630  integer :: i
631  real(dp), intent(in) :: x, a, b
632  real(dp) :: y, xred, temp
633  real(dp) :: yim1
634 
635 ! *************************************************************************
636 
637  xred = (x-(a+b)/2)/(b-a)*2
638  y = xred
639  yim1 = one
640  do i=2, n
641    temp = y
642    y = 2*xred*y - yim1
643    yim1 = temp
644  end do
645 
646 end function cheb_poly

ABINIT/chebfi [ Functions ]

[ Top ] [ Functions ]

NAME

 chebfi

FUNCTION

 this routine updates the wave functions at a given k-point,
 using the ChebFi method (see paper by A. Levitt and M. Torrent)

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 this file is distributed under the terms of the
 gnu general public license, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 for the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  dtset <type(dataset_type)>=all input variales for this dataset
  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
  kinpw(npw)=(modified) kinetic energy for each plane wave (hartree)
  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of plane waves at this k point
  prtvol=control print volume and debugging output

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)
  resid(nband)=residuals for each states
  If gs_hamk%usepaw==1:
    gsc(2,*)=<g|s|c> matrix elements (s=overlap)
  If gs_hamk%usepaw==0
    enl(nband)=contribution from each band to nonlocal pseudopotential part of total energy, at this k-point

SIDE EFFECTS

  cg(2,*)=updated wavefunctions

PARENTS

      vtowfk

CHILDREN

      apply_invovl,dotprod_g,getghc,pawcprj_alloc,pawcprj_axpby,pawcprj_copy
      pawcprj_free,prep_getghc,prep_index_wavef_bandpp
      rayleigh_ritz_distributed,rayleigh_ritz_subdiago,timab,wrtout
      xmpi_alltoallv,xmpi_barrier,xmpi_max,xmpi_min,xmpi_sum

NOTES

  -- TODO --
  Normev?
  Ecutsm
  nspinor 2
  spinors parallelisation
  fock
  -- Performance --
  Improve load balancing
  Don't diagonalize converged eigenvectors, just orthogonalize
  Maybe don't diagonalize so often (once every two outer iterations?)
  Benchmark diagonalizations, choose np_slk
  How to chose npfft?
  Implement MINRES for invovl
  -- LOBPCG --
  Improve stability (see paper by Lehoucq Sorensen, maybe use bunch-kaufman factorizations?)

SOURCE

 66 #if defined HAVE_CONFIG_H
 67 #include "config.h"
 68 #endif
 69 
 70 #include "abi_common.h"
 71 
 72 subroutine chebfi(cg,dtset,eig,enl,gs_hamk,gsc,kinpw,mpi_enreg,nband,npw,nspinor,prtvol,resid)
 73 
 74 
 75  use defs_basis
 76  use defs_abitypes
 77  use m_errors
 78  use m_xmpi
 79  use m_profiling_abi
 80  use m_cgtools, only : dotprod_g
 81  use m_efield
 82  use m_abi_linalg
 83  use m_invovl
 84  use m_bandfft_kpt, only : bandfft_kpt,bandfft_kpt_get_ikpt
 85 #if defined HAVE_MPI2
 86  use mpi
 87 #endif
 88 
 89  use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby, pawcprj_copy
 90  use m_hamiltonian, only : gs_hamiltonian_type
 91 
 92 !This section has been created automatically by the script Abilint (TD).
 93 !Do not modify the following lines by hand.
 94 #undef ABI_FUNC
 95 #define ABI_FUNC 'chebfi'
 96  use interfaces_14_hidewrite
 97  use interfaces_18_timing
 98  use interfaces_66_wfs, except_this_one => chebfi
 99 !End of the abilint section
100 
101  implicit none
102 
103 #if defined HAVE_MPI1
104  include 'mpif.h'
105 #endif
106 
107 !Arguments ------------------------------------
108  type(gs_hamiltonian_type),intent(inout) :: gs_hamk
109  type(dataset_type),intent(in) :: dtset
110  type(mpi_type),intent(inout) :: mpi_enreg
111  integer,intent(in) :: nband,npw,prtvol,nspinor
112  real(dp),intent(inout), target :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband)
113  real(dp),intent(in) :: kinpw(npw)
114  real(dp),intent(out) :: resid(nband)
115  real(dp),intent(out) :: enl(nband*(1-gs_hamk%usepaw))
116  real(dp),intent(out) :: eig(nband)
117 
118 !Local variables-------------------------------
119  real(dp) :: pcon(npw)
120  real(dp) :: filter_low
121  real(dp) :: filter_center, filter_radius
122  real(dp), dimension(2, npw*nspinor*nband), target :: ghc, gvnlc
123  real(dp), allocatable, dimension(:,:) :: cg_filter_next, cg_filter_prev, gsm1hc_filter, gsc_filter_prev, gsc_filter_next
124  real(dp), allocatable, dimension(:,:), target :: cg_alltoall1,gsc_alltoall1,ghc_alltoall1,gvnlc_alltoall1
125  real(dp), allocatable, dimension(:,:), target :: cg_alltoall2,gsc_alltoall2,ghc_alltoall2,gvnlc_alltoall2
126  real(dp), pointer, dimension(:,:) :: cg_filter, gsc_filter, ghc_filter, gvnlc_filter
127  real(dp) :: resid_vec(2, npw*nspinor)
128  logical :: paw
129  integer :: shift, shift_cg_loadbalanced
130  integer :: iband, iline, ispinor
131  integer :: sij_opt, cpopt
132  real(dp) :: eval, tsec(2)
133  integer :: tim_getghc = 5, ierr
134  integer :: i
135  integer, allocatable :: index_wavef_band(:)
136  real(dp) :: maxeig, mineig
137  real(dp), allocatable :: resids_filter(:), residvec_filter(:,:)
138  integer, allocatable :: nline_bands(:)
139  integer :: iactive, nactive
140  real(dp) :: ampfactor
141  integer :: nline_max, nline_decrease, nline_tolwfr
142  ! real(dp) :: load_imbalance
143  integer :: mcg
144  real(dp) :: dprod_r, dprod_i
145  character(len=500) :: message
146  integer :: rdisplsloc(mpi_enreg%nproc_band),recvcountsloc(mpi_enreg%nproc_band)
147  integer :: sdisplsloc(mpi_enreg%nproc_band), sendcountsloc(mpi_enreg%nproc_band)
148  integer :: ikpt_this_proc, npw_filter, nband_filter
149  type(pawcprj_type), allocatable :: cwaveprj(:,:), cwaveprj_next(:,:), cwaveprj_prev(:,:)
150  ! integer :: nline_total
151 
152  ! timers
153  integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
154  integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
155  integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610
156 
157 ! *************************************************************************
158 
159  !======================================================================================================
160  ! Initialize, transpose input cg if paral_kgb
161  !======================================================================================================
162  call timab(timer_chebfi,1,tsec)
163 
164  !Initializations
165  paw = gs_hamk%usepaw == 1
166  mcg = npw*nspinor*nband
167 
168  ! Init pcon
169  pcon = (27+kinpw*(18+kinpw*(12+8*kinpw))) / (27+kinpw*(18+kinpw*(12+8*kinpw)) + 16*kinpw**4)
170 
171  ghc=zero; gvnlc=zero
172 
173  ! Initialize the _filter pointers. Depending on paral_kgb, they might point to the actual arrays or to _alltoall variables
174  if (dtset%paral_kgb == 1) then
175    ikpt_this_proc = bandfft_kpt_get_ikpt()
176    npw_filter = bandfft_kpt(ikpt_this_proc)%ndatarecv
177    nband_filter = mpi_enreg%bandpp
178 
179    ABI_ALLOCATE(cg_alltoall1, (2, npw_filter*nspinor*nband_filter))
180    ABI_ALLOCATE(gsc_alltoall1, (2, npw_filter*nspinor*nband_filter))
181    ABI_ALLOCATE(ghc_alltoall1, (2, npw_filter*nspinor*nband_filter))
182    ABI_ALLOCATE(gvnlc_alltoall1, (2, npw_filter*nspinor*nband_filter))
183    ABI_ALLOCATE(cg_alltoall2, (2, npw_filter*nspinor*nband_filter))
184    ABI_ALLOCATE(gsc_alltoall2, (2, npw_filter*nspinor*nband_filter))
185    ABI_ALLOCATE(ghc_alltoall2, (2, npw_filter*nspinor*nband_filter))
186    ABI_ALLOCATE(gvnlc_alltoall2, (2, npw_filter*nspinor*nband_filter))
187 
188    ! Init tranpose variables
189    recvcountsloc=bandfft_kpt(ikpt_this_proc)%recvcounts*2*nspinor*mpi_enreg%bandpp
190    rdisplsloc=bandfft_kpt(ikpt_this_proc)%rdispls*2*nspinor*mpi_enreg%bandpp
191    sendcountsloc=bandfft_kpt(ikpt_this_proc)%sendcounts*2*nspinor
192    sdisplsloc=bandfft_kpt(ikpt_this_proc)%sdispls*2*nspinor
193 
194    ! Load balancing, so that each processor has approximately the same number of converged and non-converged bands
195    ! for two procs, rearrange 1 2 3 4 5 6 as 1 4 2 5 3 6
196    !
197    ! trick to save memory: ghc has the necessary size, and will be overwritten afterwards anyway
198 #define cg_loadbalanced ghc
199    shift = 0
200    do i=1, mpi_enreg%nproc_band
201      do iband=1, mpi_enreg%bandpp
202        shift_cg_loadbalanced = (i-1 + (iband-1)*mpi_enreg%nproc_band)*npw*nspinor
203        cg_loadbalanced(:, shift+1:shift+npw*nspinor) = cg(:, shift_cg_loadbalanced+1:shift_cg_loadbalanced+npw*nspinor)
204        shift = shift + npw*nspinor
205      end do
206    end do
207 
208    ! Transpose input cg into cg_alloall1. cg_alltoall1 is now (npw_filter, nband_filter)
209    call timab(timer_alltoall, 1, tsec)
210    call xmpi_alltoallv(cg_loadbalanced,sendcountsloc,sdisplsloc,cg_alltoall1,&
211 &   recvcountsloc,rdisplsloc,mpi_enreg%comm_band,ierr)
212    call timab(timer_alltoall, 2, tsec)
213 #undef cg_loadbalanced
214 
215    ! sort according to bandpp (from lobpcg, I don't fully understand what's going on but it works and it's fast)
216    call prep_index_wavef_bandpp(mpi_enreg%nproc_band,mpi_enreg%bandpp,&
217 &   nspinor,bandfft_kpt(ikpt_this_proc)%ndatarecv,&
218 &   bandfft_kpt(ikpt_this_proc)%recvcounts,bandfft_kpt(ikpt_this_proc)%rdispls,&
219 &   index_wavef_band)
220    cg_alltoall2(:,:) = cg_alltoall1(:,index_wavef_band)
221 
222    cg_filter => cg_alltoall2
223    gsc_filter => gsc_alltoall2
224    ghc_filter => ghc_alltoall2
225    gvnlc_filter => gvnlc_alltoall2
226  else
227    npw_filter = npw
228    nband_filter = nband
229 
230    cg_filter => cg
231    gsc_filter => gsc
232    ghc_filter => ghc
233    gvnlc_filter => gvnlc
234  end if
235  ! from here to the next alltoall, all computation is done on _filter variables, agnostic
236  ! to whether it's nband x npw (paral_kgb == 0) or ndatarecv*bandpp (paral_kgb = 1)
237 
238  ! Allocate filter variables for the application of the Chebyshev polynomial
239  ABI_ALLOCATE(cg_filter_next, (2, npw_filter*nspinor*nband_filter))
240  ABI_ALLOCATE(cg_filter_prev, (2, npw_filter*nspinor*nband_filter))
241  ABI_ALLOCATE(gsc_filter_prev, (2, npw_filter*nspinor*nband_filter))
242  ABI_ALLOCATE(gsc_filter_next, (2, npw_filter*nspinor*nband_filter))
243  ABI_ALLOCATE(gsm1hc_filter, (2, npw_filter*nspinor*nband_filter))
244 
245  ! PAW init
246  if(paw) then
247    ABI_DATATYPE_ALLOCATE(cwaveprj, (gs_hamk%natom,nspinor*nband_filter))
248    ABI_DATATYPE_ALLOCATE(cwaveprj_next, (gs_hamk%natom,nspinor*nband_filter))
249    ABI_DATATYPE_ALLOCATE(cwaveprj_prev, (gs_hamk%natom,nspinor*nband_filter))
250    call pawcprj_alloc(cwaveprj,0,gs_hamk%dimcprj)
251    call pawcprj_alloc(cwaveprj_next,0,gs_hamk%dimcprj)
252    call pawcprj_alloc(cwaveprj_prev,0,gs_hamk%dimcprj)
253 
254    sij_opt = 1 ! recompute S
255    cpopt = 0 ! save cprojs
256  else
257    sij_opt = 0
258    cpopt = -1
259  end if
260 
261 
262 
263  !======================================================================================================
264  ! Data in npfft x npband distribution. First getghc, update eigenvalues and residuals
265  !======================================================================================================
266  write(message, *) 'First getghc'
267  call wrtout(std_out,message,'COLL')
268 
269  ! get_ghc on cg
270  call timab(timer_getghc, 1, tsec)
271  if (dtset%paral_kgb == 0) then
272    call getghc(cpopt,cg_filter,cwaveprj,ghc_filter,gsc_filter,gs_hamk,gvnlc_filter,&
273 &   eval,mpi_enreg,nband,prtvol,sij_opt,tim_getghc,0)
274  else
275    call prep_getghc(cg_filter,gs_hamk,gvnlc_filter,ghc_filter,gsc_filter,eval,nband,mpi_enreg,&
276 &   prtvol,sij_opt,cpopt,cwaveprj,already_transposed=.true.)
277  end if
278  call timab(timer_getghc, 2, tsec)
279 
280  ! Debug barrier: should be invisible
281  call timab(timer_sync, 1, tsec)
282  call xmpi_barrier(mpi_enreg%comm_band)
283  call timab(timer_sync, 2, tsec)
284 
285  write(message, *) 'Computing residuals'
286  call wrtout(std_out,message,'COLL')
287  ! update eigenvalues and residuals
288  call timab(timer_update_eigen, 1, tsec)
289  ABI_ALLOCATE(resids_filter, (nband_filter))
290  ABI_ALLOCATE(residvec_filter, (2, npw_filter*nspinor))
291  ABI_ALLOCATE(nline_bands, (nband_filter))
292  do iband=1, nband_filter
293    shift = npw_filter*nspinor*(iband-1)
294    call dotprod_g(eig(iband),dprod_i,gs_hamk%istwf_k,npw_filter*nspinor,1,ghc_filter(:, shift+1:shift+npw_filter*nspinor),&
295 &   cg_filter(:, shift+1:shift+npw_filter*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_spinorfft)
296    if(paw) then
297      call dotprod_g(dprod_r,dprod_i,gs_hamk%istwf_k,npw_filter*nspinor,1,gsc_filter(:, shift+1:shift+npw_filter*nspinor),&
298 &     cg_filter(:, shift+1:shift+npw_filter*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_spinorfft)
299      eig(iband) = eig(iband)/dprod_r
300    end if
301 
302    if(paw) then
303      residvec_filter = ghc_filter(:, shift+1 : shift+npw_filter*nspinor) &
304 &     - eig(iband)*gsc_filter(:, shift+1 : shift+npw_filter*nspinor)
305    else
306      residvec_filter = ghc_filter(:, shift+1 : shift+npw_filter*nspinor) &
307 &     - eig(iband)*cg_filter(:, shift+1 : shift+npw_filter*nspinor)
308    end if
309    resids_filter(iband) = SUM(residvec_filter**2)
310  end do
311  call xmpi_sum(resids_filter,mpi_enreg%comm_fft,ierr)
312  call xmpi_max(MAXVAL(eig(1:nband_filter)),maxeig,mpi_enreg%comm_band,ierr)
313  call xmpi_min(MINVAL(eig(1:nband_filter)),mineig,mpi_enreg%comm_band,ierr)
314  filter_low = maxeig
315  call timab(timer_update_eigen, 2, tsec)
316 
317  ! Decide how many iterations per band are needed
318  ! don't go above this, or face bad conditioning of the Gram matrix.
319  nline_max = cheb_oracle(mineig, filter_low, dtset%ecut, 1e-16_dp, 40)
320  ! if(mpi_enreg%me == 0) write(0, *) nline_max
321  do iband=1, nband_filter
322    ! nline necessary to converge to tolwfr
323    nline_tolwfr = cheb_oracle(eig(iband), filter_low, dtset%ecut, dtset%tolwfr / resids_filter(iband), dtset%nline)
324    ! nline necessary to decrease residual by a constant factor
325    nline_decrease = cheb_oracle(eig(iband), filter_low, dtset%ecut, 0.1_dp, dtset%nline)
326 
327    nline_bands(iband) = MAX(MIN(nline_tolwfr, nline_decrease, nline_max, dtset%nline), 1)
328    nline_bands(iband) = dtset%nline ! fiddle with this to use locking
329  end do
330 
331 
332  !!!!! Uncomment for diagnostics
333  ! nline_total = SUM(nline_bands)
334  ! call xmpi_sum(nline_total, mpi_enreg%comm_band, ierr)
335  ! load_imbalance = (SUM(nline_bands) - REAL(nline_total)/REAL(mpi_enreg%nproc_band)) / &
336  ! &                (REAL(nline_total)/REAL(mpi_enreg%nproc_band))
337  ! call xmax_mpi(load_imbalance, mpi_enreg%comm_band, ierr)
338 
339  ! write(message, *) 'Mean nline', REAL(nline_total)/REAL(nband), 'max imbalance (%)', load_imbalance*100
340  ! call wrtout(std_out,message,'COLL')
341 
342  ABI_DEALLOCATE(resids_filter)
343  ABI_DEALLOCATE(residvec_filter)
344 
345  !======================================================================================================
346  ! Chebyshev polynomial application
347  !======================================================================================================
348  ! Filter by a chebyshev polynomial of degree nline
349  do iline=1,dtset%nline
350    ! Filter only on [iactive, iactive+nactive-1]
351    iactive = nband_filter
352    do iband = 1, nband_filter
353      ! does iband need an iteration?
354      if (nline_bands(iband) >= iline) then
355        iactive = iband
356        exit
357      end if
358    end do
359    nactive = nband_filter - iactive + 1
360    shift = npw_filter*nspinor*(iactive-1) + 1
361    ! trick the legacy prep_getghc
362    mpi_enreg%bandpp = nactive
363 
364    ! Define the filter position
365    filter_center = (dtset%ecut+filter_low)/2
366    filter_radius = (dtset%ecut-filter_low)/2
367 
368    ! write(message, *) 'Applying invovl, iteration', iline
369    ! call wrtout(std_out,message,'COLL')
370 
371    ! If paw, have to apply S^-1
372    if(paw) then
373      call timab(timer_apply_inv_ovl, 1, tsec)
374      call apply_invovl(gs_hamk, ghc_filter(:,shift:), gsm1hc_filter(:,shift:), cwaveprj_next(:,iactive:), &
375 &     npw_filter, nactive, mpi_enreg, nspinor)
376      call timab(timer_apply_inv_ovl, 2, tsec)
377    else
378      gsm1hc_filter(:,shift:) = ghc_filter(:,shift:)
379    end if
380 
381    ! Chebyshev iteration: UPDATE cg
382    if(iline == 1) then
383      cg_filter_next(:,shift:) = one/filter_radius * (gsm1hc_filter(:,shift:) - filter_center*cg_filter(:,shift:))
384    else
385      cg_filter_next(:,shift:) = two/filter_radius * (gsm1hc_filter(:,shift:) - filter_center*cg_filter(:,shift:)) &
386 &     - cg_filter_prev(:,shift:)
387    end if
388    ! Update gsc and cwaveprj
389    if(paw) then
390      if(iline == 1) then
391        gsc_filter_next(:,shift:) = one/filter_radius * (ghc_filter(:,shift:) - filter_center*gsc_filter(:,shift:))
392        !cwaveprj_next = one/filter_radius * (cwaveprj_next - filter_center*cwaveprj)
393        call pawcprj_axpby(-filter_center/filter_radius, one/filter_radius,cwaveprj(:,iactive:),cwaveprj_next(:,iactive:))
394      else
395        gsc_filter_next(:,shift:) = two/filter_radius * (ghc_filter(:,shift:) - filter_center*gsc_filter(:,shift:))&
396 &       - gsc_filter_prev(:,shift:)
397        !cwaveprj_next = two/filter_radius * (cwaveprj_next - filter_center*cwaveprj) - cwaveprj_prev
398        call pawcprj_axpby(-two*filter_center/filter_radius, two/filter_radius,cwaveprj(:,iactive:),cwaveprj_next(:,iactive:))
399        call pawcprj_axpby(-one, one,cwaveprj_prev(:,iactive:),cwaveprj_next(:,iactive:))
400      end if
401    end if
402 
403    ! Bookkeeping of the _prev variables
404    cg_filter_prev(:,shift:) = cg_filter(:,shift:)
405    cg_filter(:,shift:) = cg_filter_next(:,shift:)
406    if(paw) then
407      gsc_filter_prev(:,shift:) = gsc_filter(:,shift:)
408      gsc_filter(:,shift:) = gsc_filter_next(:,shift:)
409 
410      call pawcprj_copy(cwaveprj(:,iactive:),cwaveprj_prev(:,iactive:))
411      call pawcprj_copy(cwaveprj_next(:,iactive:),cwaveprj(:,iactive:))
412    end if
413 
414    ! Update ghc
415    if(paw) then
416      !! DEBUG use this to remove the optimization and recompute gsc/cprojs
417      ! sij_opt = 1
418      ! cpopt = 0
419 
420      sij_opt = 0 ! gsc is already computed
421      cpopt = 2 ! reuse cprojs
422    else
423      sij_opt = 0
424      cpopt = -1
425    end if
426 
427    write(message, *) 'Getghc, iteration', iline
428    call wrtout(std_out,message,'COLL')
429 
430    call timab(timer_getghc, 1, tsec)
431    if (dtset%paral_kgb == 0) then
432      call getghc(cpopt,cg_filter(:,shift:),cwaveprj(:,iactive:),ghc_filter(:,shift:),&
433 &     gsc_filter(:,shift:),gs_hamk,gvnlc_filter(:,shift:),eval,mpi_enreg,&
434 &     nband,prtvol,sij_opt,tim_getghc,0)
435    else
436      call prep_getghc(cg_filter(:,shift:),gs_hamk,gvnlc_filter(:,shift:),ghc_filter(:,shift:),&
437 &     gsc_filter(:,shift:),eval,nband,mpi_enreg,prtvol,sij_opt,cpopt,&
438 &     cwaveprj(:,iactive:),already_transposed=.true.)
439    end if
440 
441    ! end of the trick
442    mpi_enreg%bandpp = nband_filter
443 
444    call timab(timer_getghc, 2, tsec)
445  end do ! end loop on nline
446 
447  ! normalize according to the previously computed rayleigh quotients (inaccurate, but cheap)
448  do iband = 1, nband_filter
449    ampfactor = cheb_poly(eig(iband), nline_bands(iband), filter_low, dtset%ecut)
450    if(abs(ampfactor) < 1e-3) ampfactor = 1e-3 ! just in case, avoid amplifying too much
451    shift = npw_filter*nspinor*(iband-1)
452    cg_filter(:, shift+1:shift+npw_filter*nspinor) = cg_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor
453    ghc_filter(:, shift+1:shift+npw_filter*nspinor) = ghc_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor
454    if(paw) then
455      gsc_filter(:, shift+1:shift+npw_filter*nspinor) = gsc_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor
456    else
457      gvnlc_filter(:, shift+1:shift+npw_filter*nspinor) = gvnlc_filter(:, shift+1:shift+npw_filter*nspinor) / ampfactor
458    end if
459  end do
460 
461  ! Cleanup
462  if(paw) then
463    call pawcprj_free(cwaveprj)
464    call pawcprj_free(cwaveprj_next)
465    call pawcprj_free(cwaveprj_prev)
466    ABI_DATATYPE_DEALLOCATE(cwaveprj)
467    ABI_DATATYPE_DEALLOCATE(cwaveprj_next)
468    ABI_DATATYPE_DEALLOCATE(cwaveprj_prev)
469  end if
470  ABI_DEALLOCATE(nline_bands)
471  ABI_DEALLOCATE(cg_filter_next)
472  ABI_DEALLOCATE(cg_filter_prev)
473  ABI_DEALLOCATE(gsc_filter_prev)
474  ABI_DEALLOCATE(gsc_filter_next)
475  ABI_DEALLOCATE(gsm1hc_filter)
476 
477  !======================================================================================================
478  ! Filtering done, tranpose back
479  !======================================================================================================
480 
481  write(message, *) 'Filtering done, transposing back'
482  call wrtout(std_out,message,'COLL')
483 
484  ! transpose back
485  if(dtset%paral_kgb == 1) then
486    cg_alltoall1(:,index_wavef_band) = cg_alltoall2(:,:)
487    ghc_alltoall1(:,index_wavef_band) = ghc_alltoall2(:,:)
488    if(paw) then
489      gsc_alltoall1(:,index_wavef_band) = gsc_alltoall2(:,:)
490    else
491      gvnlc_alltoall1(:,index_wavef_band) = gvnlc_alltoall2(:,:)
492    end if
493 
494    ABI_DEALLOCATE(index_wavef_band)
495 
496    call timab(timer_sync, 1, tsec)
497    call xmpi_barrier(mpi_enreg%comm_band)
498    call timab(timer_sync, 2, tsec)
499 
500    call timab(timer_alltoall, 1, tsec)
501 
502   ! Do we pack the arrays in the alltoall, saving latency, or do we do it separately, saving memory and copies?
503    call xmpi_alltoallv(cg_alltoall1,recvcountsloc,rdisplsloc,cg,&
504 &   sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr)
505    call xmpi_alltoallv(ghc_alltoall1,recvcountsloc,rdisplsloc,ghc,&
506 &   sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr)
507    if(paw) then
508      call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
509 &     sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr)
510    else
511      call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
512 &     sendcountsloc,sdisplsloc,mpi_enreg%comm_band,ierr)
513    end if
514    call timab(timer_alltoall, 2, tsec)
515 
516    if(mpi_enreg%paral_kgb == 1) then
517      ABI_DEALLOCATE(cg_alltoall1)
518      ABI_DEALLOCATE(gsc_alltoall1)
519      ABI_DEALLOCATE(ghc_alltoall1)
520      ABI_DEALLOCATE(gvnlc_alltoall1)
521      ABI_DEALLOCATE(cg_alltoall2)
522      ABI_DEALLOCATE(gsc_alltoall2)
523      ABI_DEALLOCATE(ghc_alltoall2)
524      ABI_DEALLOCATE(gvnlc_alltoall2)
525    end if
526  else
527    ! nothing to do, the _filter variables already point to the right ones
528  end if
529 
530 
531 
532  !======================================================================================================
533  ! Data in (npfft*npband) x 1 distribution. Rayleigh-Ritz step
534  !======================================================================================================
535 
536  ! _subdiago might use less memory when using only one proc, should maybe call it, or just remove it
537  ! and always call _distributed
538 #if defined HAVE_LINALG_SCALAPACK
539  call rayleigh_ritz_distributed(cg,ghc,gsc,gvnlc,eig,gs_hamk%istwf_k,mpi_enreg,nband,npw,nspinor,gs_hamk%usepaw)
540 #else
541  call rayleigh_ritz_subdiago(cg,ghc,gsc,gvnlc,eig,gs_hamk%istwf_k,mpi_enreg,nband,npw,nspinor,gs_hamk%usepaw)
542 #endif
543 
544  ! Build residuals
545  call timab(timer_residuals, 1, tsec)
546  do iband=1,nband
547    shift = npw*nspinor*(iband-1)
548    if(paw) then
549      resid_vec = ghc(:, shift+1 : shift+npw*nspinor) - eig(iband)*gsc(:, shift+1 : shift+npw*nspinor)
550    else
551      resid_vec = ghc(:, shift+1 : shift+npw*nspinor) - eig(iband)*cg (:, shift+1 : shift+npw*nspinor)
552    end if
553 
554    ! precondition resid_vec
555    do ispinor = 1,nspinor
556      resid_vec(1, npw*(ispinor-1)+1:npw*ispinor) = resid_vec(1, npw*(ispinor-1)+1:npw*ispinor) * pcon
557      resid_vec(2, npw*(ispinor-1)+1:npw*ispinor) = resid_vec(2, npw*(ispinor-1)+1:npw*ispinor) * pcon
558    end do
559 
560    call dotprod_g(resid(iband),dprod_i,gs_hamk%istwf_k,npw*nspinor,1,resid_vec,&
561 &   resid_vec,mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft)
562 
563    if(.not. paw) then
564      call dotprod_g(enl(iband),dprod_i,gs_hamk%istwf_k,npw*nspinor,1,cg(:, shift+1:shift+npw*nspinor),&
565 &     gvnlc(:, shift+1:shift+npw_filter*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft)
566    end if
567  end do
568  call timab(timer_residuals, 2, tsec)
569 
570  ! write(message, '(a,4e10.2)') 'Resids (1, N, min, max) ', resid(1), resid(nband), MINVAL(resid), MAXVAL(resid)
571  ! call wrtout(std_out,message,'COLL')
572 
573  ! write(message,*)'Eigens(1,nocc,nband) ',eig(1), eig(ilastocc),eig(nband)
574  ! call wrtout(std_out,message,'COLL')
575  ! write(message,*)'Resids(1,nocc,nband) ',resid(1), resid(ilastocc),resid(nband)
576  ! call wrtout(std_out,message,'COLL')
577 
578  call timab(timer_chebfi,2,tsec)
579 
580 end subroutine chebfi