TABLE OF CONTENTS


ABINIT/forstrnps [ Functions ]

[ Top ] [ Functions ]

NAME

 forstrnps

FUNCTION

 Compute nonlocal pseudopotential energy contribution to forces and/or stress tensor
 as well as kinetic energy contribution to stress tensor.

COPYRIGHT

 Copyright (C) 1998-2018 ABINIT group (DCA, XG, GMR, AF, AR, MB, MT)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  cg(2,mcg)=wavefunctions (may be read from disk file)
  cprj(natom,mcprj*usecprj)=<p_lmn|Cnk> coefficients for each WF |Cnk> and each NL proj |p_lmn>
  ecut=cut-off energy for plane wave basis sphere (Ha)
  ecutsm=smearing energy for plane wave kinetic energy (Ha)
  effmass_free=effective mass for electrons (1. in common case)
  electronpositron <type(electronpositron_type)>=quantities for the electron-positron annihilation (optional argument)
  eigen(mband*nkpt*nsppol)=array for holding eigenvalues (hartree)
  fock <type(fock_type)>= quantities to calculate Fock exact exchange
  istwfk(nkpt)=input option parameter that describes the storage of wfs
  kg(3,mpw*mkmem)=reduced coordinates (integers) of G vecs in basis
  kpt(3,nkpt)=k points in reduced coordinates
  mband=maximum number of bands
  mcg=size of wave-functions array (cg) =mpw*nspinor*mband*mkmem*nsppol
  mcprj=size of projected wave-functions array (cprj) =nspinor*mband*mkmem*nsppol
  mgfft=maximum size of 1D FFTs
  mkmem=number of k points treated by this node.
  mpi_enreg=informations about MPI parallelization
  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
  mpw= maximum number of plane waves
  my_natom=number of atoms treated by current processor
  natom=number of atoms in cell.
  nband(nkpt)=number of bands at each k point
  nfft=number of FFT grid points
  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
  nkpt=number of k points in Brillouin zone
  nloalg(3)=governs the choice of the algorithm for non-local operator.
  npwarr(nkpt)=number of planewaves in basis and boundary at each k
  nspden=Number of spin Density components
  nspinor=number of spinorial components of the wavefunctions
  nsppol=1 for unpolarized, 2 for spin-polarized
  nsym=number of elements in symmetry group
  ntypat=number of types of atoms
  nucdipmom(3,my_natom)= nuclear dipole moments
  occ(mband*nkpt*nsppol)=occupation numbers for each band over all k points
  optfor=1 if computation of forces is required
  paw_ij(my_natom*usepaw) <type(paw_ij_type)>=paw arrays given on (i,j) channels
  pawtab(ntypat*usepaw) <type(pawtab_type)>=paw tabulated starting data
  ph1d(2,3*(2*mgfft+1)*natom)=one-dimensional structure factor information
  psps <type(pseudopotential_type)>=variables related to pseudopotentials
  rprimd(3,3)=dimensional primitive translations in real space (bohr)
  stress_needed=1 if computation of stress tensor is required
  symrec(3,3,nsym)=symmetries in reciprocal space (dimensionless)
  typat(natom)=type of each atom
  usecprj=1 if cprj datastructure has been allocated
  use_gpu_cuda= 0 or 1 to know if we use cuda for nonlop call
  wtk(nkpt)=weight associated with each k point
  xred(3,natom)=reduced dimensionless atomic coordinates
  ylm(mpw*mkmem,mpsang*mpsang*useylm)= real spherical harmonics for each G and k point
  ylmgr(mpw*mkmem,3,mpsang*mpsang*useylm)= gradients of real spherical harmonics

OUTPUT

  if (optfor==1)
   grnl(3*natom*optfor)=stores grads of nonlocal energy wrt atomic coordinates
  if (stress_needed==1)
   kinstr(6)=kinetic energy part of stress tensor (hartree/bohr^3)
   Store 6 unique components of symmetric 3x3 tensor in the order
   11, 22, 33, 32, 31, 21
   npsstr(6)=nonlocal pseudopotential energy part of stress tensor
    (hartree/bohr^3)

PARENTS

      forstr

CHILDREN

      bandfft_kpt_restoretabs,bandfft_kpt_savetabs,destroy_hamiltonian
      fock_getghc,init_hamiltonian,load_k_hamiltonian,load_spin_hamiltonian
      meanvalue_g,mkffnl,mkkpg,nonlop,pawcprj_alloc,pawcprj_free,pawcprj_get
      pawcprj_reorder,prep_bandfft_tabs,prep_nonlop,stresssym,timab,xmpi_sum

SOURCE

 89 #if defined HAVE_CONFIG_H
 90 #include "config.h"
 91 #endif
 92 
 93 #include "abi_common.h"
 94 
 95 subroutine forstrnps(cg,cprj,ecut,ecutsm,effmass_free,eigen,electronpositron,fock,&
 96 &  grnl,istwfk,kg,kinstr,npsstr,kpt,mband,mcg,mcprj,mgfft,mkmem,mpi_enreg,mpsang,&
 97 &  mpw,my_natom,natom,nband,nfft,ngfft,nkpt,nloalg,npwarr,nspden,nspinor,nsppol,nsym,&
 98 &  ntypat,nucdipmom,occ,optfor,paw_ij,pawtab,ph1d,psps,rprimd,&
 99 &  stress_needed,symrec,typat,usecprj,usefock,use_gpu_cuda,wtk,xred,ylm,ylmgr)
100 
101  use defs_basis
102  use defs_datatypes
103  use defs_abitypes
104  use m_profiling_abi
105  use m_xmpi
106  use m_errors
107  use m_fock
108  use m_hamiltonian,      only : init_hamiltonian,destroy_hamiltonian,load_spin_hamiltonian,&
109 &                               load_k_hamiltonian,gs_hamiltonian_type,load_kprime_hamiltonian!,K_H_KPRIME
110  use m_electronpositron, only : electronpositron_type,electronpositron_calctype
111  use m_bandfft_kpt,      only : bandfft_kpt,bandfft_kpt_type,&
112 &                               bandfft_kpt_savetabs,bandfft_kpt_restoretabs
113  use m_pawtab,           only : pawtab_type
114  use m_paw_ij,           only : paw_ij_type
115  use m_pawcprj,          only : pawcprj_type,pawcprj_alloc,pawcprj_free,pawcprj_get,pawcprj_reorder
116 use m_cgtools
117 
118 !This section has been created automatically by the script Abilint (TD).
119 !Do not modify the following lines by hand.
120 #undef ABI_FUNC
121 #define ABI_FUNC 'forstrnps'
122  use interfaces_18_timing
123  use interfaces_32_util
124  use interfaces_41_geometry
125  use interfaces_53_spacepar
126  use interfaces_66_nonlocal
127  use interfaces_66_wfs
128 !End of the abilint section
129 
130  implicit none
131 
132 !Arguments ------------------------------------
133 !scalars
134  integer,intent(in) :: mband,mcg,mcprj,mgfft,mkmem,mpsang,mpw,my_natom,natom,nfft,nkpt
135  integer,intent(in) :: nspden,nsppol,nspinor,nsym,ntypat,optfor,stress_needed
136  integer,intent(in) :: usecprj,usefock,use_gpu_cuda
137  real(dp),intent(in) :: ecut,ecutsm,effmass_free
138  type(electronpositron_type),pointer :: electronpositron
139  type(MPI_type),intent(inout) :: mpi_enreg
140  type(pseudopotential_type),intent(in) :: psps
141 !arrays
142  integer,intent(in) :: istwfk(nkpt),kg(3,mpw*mkmem),nband(nkpt*nsppol)
143  integer,intent(in) :: ngfft(18),nloalg(3),npwarr(nkpt)
144  integer,intent(in) :: symrec(3,3,nsym),typat(natom)
145  real(dp),intent(in) :: cg(2,mcg)
146  real(dp),intent(in) :: eigen(mband*nkpt*nsppol),kpt(3,nkpt),nucdipmom(3,my_natom)
147  real(dp),intent(in) :: occ(mband*nkpt*nsppol),ph1d(2,3*(2*mgfft+1)*natom)
148  real(dp),intent(in) :: rprimd(3,3),wtk(nkpt),xred(3,natom)
149  real(dp),intent(in) :: ylm(mpw*mkmem,mpsang*mpsang*psps%useylm)
150  real(dp),intent(in) :: ylmgr(mpw*mkmem,3,mpsang*mpsang*psps%useylm)
151  real(dp),intent(out) :: grnl(3*natom*optfor),kinstr(6),npsstr(6)
152  type(pawcprj_type),intent(inout) :: cprj(natom,mcprj*usecprj)
153  type(paw_ij_type),intent(in) :: paw_ij(my_natom*psps%usepaw)
154  type(pawtab_type),intent(in) :: pawtab(ntypat*psps%usepaw)
155  type(fock_type),pointer, intent(inout) :: fock
156 !Local variables-------------------------------
157 !scalars
158  integer,parameter :: tim_rwwf=7
159  integer :: bandpp,bdtot_index,choice,cpopt,dimffnl,dimffnl_str,iband,iband_cprj,iband_last,ibg,icg,ider,ider_str
160  integer :: idir,idir_str,ierr,ii,ikg,ikpt,ilm,ipositron,ipw,ishift,isppol,istwf_k
161  integer :: mband_cprj,me_distrb,my_ikpt,my_nspinor,nband_k,nband_cprj_k,ndat,nkpg
162  integer :: nnlout,npw_k,paw_opt,signs,spaceComm
163  integer :: tim_nonlop,tim_nonlop_prep,usecprj_local,use_ACE_old
164  integer :: blocksize,iblock,iblocksize,ibs,nblockbd
165  real(dp) :: ar,renorm_factor,dfsm,ecutsm_inv,fact_kin,fsm,htpisq,kgc1
166  real(dp) :: kgc2,kgc3,kin,xx
167  type(gs_hamiltonian_type) :: gs_hamk
168  logical :: compute_gbound,usefock_loc
169  character(len=500) :: msg
170  type(fock_common_type),pointer :: fockcommon
171 !arrays
172  integer,allocatable :: kg_k(:,:)
173  real(dp) :: kpoint(3),nonlop_dum(1,1),rmet(3,3),tsec(2)
174  real(dp),allocatable :: cwavef(:,:),enlout(:),ffnl_sav(:,:,:,:),ffnl_str(:,:,:,:)
175  real(dp),allocatable :: ghc_dum(:,:),gprimd(:,:),kpg_k(:,:),kpg_k_sav(:,:)
176  real(dp),allocatable :: kstr1(:),kstr2(:),kstr3(:),kstr4(:),kstr5(:),kstr6(:)
177  real(dp),allocatable :: lambda(:),occblock(:),ph3d(:,:,:),ph3d_sav(:,:,:)
178  real(dp),allocatable :: weight(:),ylm_k(:,:),ylmgr_k(:,:,:)
179  real(dp),allocatable,target :: ffnl(:,:,:,:)
180  type(bandfft_kpt_type),pointer :: my_bandfft_kpt => null()
181  type(pawcprj_type),target,allocatable :: cwaveprj(:,:)
182  type(pawcprj_type),pointer :: cwaveprj_idat(:,:)
183 
184 
185 !*************************************************************************
186 
187  call timab(920,1,tsec)
188  call timab(921,1,tsec)
189 
190 !Init mpicomm and me
191  if(mpi_enreg%paral_kgb==1)then
192    spaceComm=mpi_enreg%comm_kpt
193    me_distrb=mpi_enreg%me_kpt
194  else
195 !* In case of HF calculation
196    if (mpi_enreg%paral_hf==1) then
197      spaceComm=mpi_enreg%comm_kpt
198      me_distrb=mpi_enreg%me_kpt
199    else
200      spaceComm=mpi_enreg%comm_cell
201      me_distrb=mpi_enreg%me_cell
202    end if
203  end if
204 
205 !Some constants
206  ipositron=abs(electronpositron_calctype(electronpositron))
207  my_nspinor=max(1,nspinor/mpi_enreg%nproc_spinor)
208 !Smearing of plane wave kinetic energy
209  ecutsm_inv=zero;if( ecutsm>1.0d-20) ecutsm_inv=1/ecutsm
210 !htpisq is (1/2) (2 Pi) **2:
211  htpisq=0.5_dp*(two_pi)**2
212 
213 !Check that fock is present if want to use fock option
214  compute_gbound=.false.
215  usefock_loc = (usefock==1 .and. associated(fock))
216 !Arrays initializations
217  grnl(:)=zero
218  if (usefock_loc) then
219    fockcommon =>fock%fock_common
220    fockcommon%optfor=.false.
221    fockcommon%optstr=.false.
222    use_ACE_old=fockcommon%use_ACE
223    fockcommon%use_ACE=0
224    if (fockcommon%optfor) compute_gbound=.true.
225  end if
226  if (stress_needed==1) then
227    kinstr(:)=zero;npsstr(:)=zero
228    if (usefock_loc) then
229      fockcommon%optstr=.TRUE.
230      fockcommon%stress=zero
231      compute_gbound=.true.
232    end if
233  end if
234  
235  usecprj_local=usecprj
236 
237  if ((usefock_loc).and.(psps%usepaw==1)) then
238    usecprj_local=1
239    if(optfor==1)then 
240      fockcommon%optfor=.true.
241      if (.not.allocated(fockcommon%forces_ikpt)) then
242        ABI_ALLOCATE(fockcommon%forces_ikpt,(3,natom,mband))
243      end if
244      if (.not.allocated(fockcommon%forces)) then
245        ABI_ALLOCATE(fockcommon%forces,(3,natom))
246      end if
247      fockcommon%forces=zero
248      compute_gbound=.true.
249    end if
250  end if
251 
252 !Initialize Hamiltonian (k-independent terms)
253 
254  call init_hamiltonian(gs_hamk,psps,pawtab,nspinor,nsppol,nspden,natom,&
255 & typat,xred,nfft,mgfft,ngfft,rprimd,nloalg,usecprj=usecprj_local,&
256 & comm_atom=mpi_enreg%comm_atom,mpi_atmtab=mpi_enreg%my_atmtab,mpi_spintab=mpi_enreg%my_isppoltab,&
257 & paw_ij=paw_ij,ph1d=ph1d,electronpositron=electronpositron,fock=fock,&
258 & nucdipmom=nucdipmom,use_gpu_cuda=use_gpu_cuda)
259  rmet = MATMUL(TRANSPOSE(rprimd),rprimd)
260  call timab(921,2,tsec)
261 
262 !need to reorder cprj=<p_lmn|Cnk> (from unsorted to atom-sorted)
263  if (psps%usepaw==1.and.usecprj_local==1) then
264    call pawcprj_reorder(cprj,gs_hamk%atindx)
265  end if
266 
267 !Common data for "nonlop" routine
268  signs=1 ; idir=0  ; ishift=0 ; tim_nonlop=4 ; tim_nonlop_prep=12
269  choice=2*optfor;if (stress_needed==1) choice=10*choice+3
270  if (optfor==1.and.stress_needed==1)  ishift=6
271  nnlout=max(1,6*stress_needed+3*natom*optfor)
272  if (psps%usepaw==0) then
273    paw_opt=0 ; cpopt=-1
274  else
275    paw_opt=2 ; cpopt=-1+3*usecprj_local
276  end if
277 
278  call timab(921,2,tsec)
279 
280 !LOOP OVER SPINS
281  bdtot_index=0;ibg=0;icg=0
282  do isppol=1,nsppol
283 
284 !  Continue to initialize the Hamiltonian (PAW DIJ coefficients)
285    call load_spin_hamiltonian(gs_hamk,isppol,with_nonlocal=.true.)
286    if (usefock_loc) fockcommon%isppol=isppol
287 
288 !  Loop over k points
289    ikg=0
290    do ikpt=1,nkpt
291      if (usefock_loc) fockcommon%ikpt=ikpt
292      nband_k=nband(ikpt+(isppol-1)*nkpt)
293      istwf_k=istwfk(ikpt)
294      npw_k=npwarr(ikpt)
295      kpoint(:)=kpt(:,ikpt)
296 
297      if(proc_distrb_cycle(mpi_enreg%proc_distrb,ikpt,1,nband_k,isppol,me_distrb)) then
298        bdtot_index=bdtot_index+nband_k
299        cycle
300      end if
301 
302      call timab(922,1,tsec)
303 
304 !    Parallelism over FFT and/or bands: define sizes and tabs
305      if (mpi_enreg%paral_kgb==1) then
306        my_ikpt=mpi_enreg%my_kpttab(ikpt)
307        nblockbd=nband_k/(mpi_enreg%nproc_band*mpi_enreg%bandpp)
308        bandpp=mpi_enreg%bandpp
309        my_bandfft_kpt => bandfft_kpt(my_ikpt)
310      else
311        my_ikpt=ikpt
312        bandpp=mpi_enreg%bandpp
313        nblockbd=nband_k/bandpp
314      end if
315      blocksize=nband_k/nblockbd
316      mband_cprj=mband/mpi_enreg%nproc_band
317      nband_cprj_k=nband_k/mpi_enreg%nproc_band
318 
319      ABI_ALLOCATE(cwavef,(2,npw_k*my_nspinor*blocksize))
320      if (psps%usepaw==1.and.usecprj_local==1) then
321        ABI_DATATYPE_ALLOCATE(cwaveprj,(natom,my_nspinor*bandpp))
322        call pawcprj_alloc(cwaveprj,0,gs_hamk%dimcprj)
323      else
324        ABI_DATATYPE_ALLOCATE(cwaveprj,(0,0))
325      end if
326 
327      if (stress_needed==1) then
328        ABI_ALLOCATE(kstr1,(npw_k))
329        ABI_ALLOCATE(kstr2,(npw_k))
330        ABI_ALLOCATE(kstr3,(npw_k))
331        ABI_ALLOCATE(kstr4,(npw_k))
332        ABI_ALLOCATE(kstr5,(npw_k))
333        ABI_ALLOCATE(kstr6,(npw_k))
334      end if
335 
336      ABI_ALLOCATE(kg_k,(3,mpw))
337 !$OMP PARALLEL DO
338      do ipw=1,npw_k
339        kg_k(:,ipw)=kg(:,ipw+ikg)
340      end do
341 
342      ABI_ALLOCATE(ylm_k,(npw_k,mpsang*mpsang*psps%useylm))
343      if (stress_needed==1) then
344        ABI_ALLOCATE(ylmgr_k,(npw_k,3,mpsang*mpsang*psps%useylm))
345      else
346        ABI_ALLOCATE(ylmgr_k,(0,0,0))
347      end if
348      if (psps%useylm==1) then
349 !$OMP PARALLEL DO COLLAPSE(2)
350        do ilm=1,mpsang*mpsang
351          do ipw=1,npw_k
352            ylm_k(ipw,ilm)=ylm(ipw+ikg,ilm)
353          end do
354        end do
355        if (stress_needed==1) then
356 !$OMP PARALLEL DO COLLAPSE(2)
357          do ilm=1,mpsang*mpsang
358            do ii=1,3
359              do ipw=1,npw_k
360                ylmgr_k(ipw,ii,ilm)=ylmgr(ipw+ikg,ii,ilm)
361              end do
362            end do
363          end do
364        end if
365      end if
366 
367 !    Prepare kinetic contribution to stress tensor (Warning : the symmetry
368 !    has not been broken, like in mkkin.f or kpg3.f . It should be, in order to be coherent).
369      if (stress_needed==1) then
370        ABI_ALLOCATE(gprimd,(3,3))
371        gprimd=gs_hamk%gprimd
372 !$OMP PARALLEL DO PRIVATE(fact_kin,ipw,kgc1,kgc2,kgc3,kin,xx,fsm,dfsm) &
373 !$OMP&SHARED(ecut,ecutsm,ecutsm_inv,gs_hamk,htpisq,kg_k,kpoint,kstr1,kstr2,kstr3,kstr4,kstr5,kstr6,npw_k)
374        do ipw=1,npw_k
375 !        Compute Cartesian coordinates of (k+G)
376          kgc1=gprimd(1,1)*(kpoint(1)+kg_k(1,ipw))+&
377 &         gprimd(1,2)*(kpoint(2)+kg_k(2,ipw))+&
378 &         gprimd(1,3)*(kpoint(3)+kg_k(3,ipw))
379          kgc2=gprimd(2,1)*(kpoint(1)+kg_k(1,ipw))+&
380 &         gprimd(2,2)*(kpoint(2)+kg_k(2,ipw))+&
381 &         gprimd(2,3)*(kpoint(3)+kg_k(3,ipw))
382          kgc3=gprimd(3,1)*(kpoint(1)+kg_k(1,ipw))+&
383 &         gprimd(3,2)*(kpoint(2)+kg_k(2,ipw))+&
384 &         gprimd(3,3)*(kpoint(3)+kg_k(3,ipw))
385          kin=htpisq* ( kgc1**2 + kgc2**2 + kgc3**2 )
386          fact_kin=1.0_dp
387          if(kin>ecut-ecutsm)then
388            if(kin>ecut)then
389              fact_kin=0.0_dp
390            else
391 !            See the routine mkkin.f, for the smearing procedure
392              xx=(ecut-kin)*ecutsm_inv
393 !            This kinetic cutoff smoothing function and its xx derivatives
394 !            were produced with Mathematica and the fortran code has been
395 !            numerically checked against Mathematica.
396              fsm=1.0_dp/(xx**2*(3+xx*(1+xx*(-6+3*xx))))
397              dfsm=-3.0_dp*(-1+xx)**2*xx*(2+5*xx)*fsm**2
398 !            d2fsm=6.0_dp*xx**2*(9+xx*(8+xx*(-52+xx*(-3+xx*(137+xx*&
399 !            &                         (-144+45*xx))))))*fsm**3
400              fact_kin=fsm+kin*(-ecutsm_inv)*dfsm
401            end if
402          end if
403          kstr1(ipw)=fact_kin*kgc1*kgc1
404          kstr2(ipw)=fact_kin*kgc2*kgc2
405          kstr3(ipw)=fact_kin*kgc3*kgc3
406          kstr4(ipw)=fact_kin*kgc3*kgc2
407          kstr5(ipw)=fact_kin*kgc3*kgc1
408          kstr6(ipw)=fact_kin*kgc2*kgc1
409        end do ! ipw
410        ABI_DEALLOCATE(gprimd)
411      end if
412 
413 !    Compute (k+G) vectors
414      nkpg=3*nloalg(3)
415      ABI_ALLOCATE(kpg_k,(npw_k,nkpg))
416      if (nkpg>0) then
417        call mkkpg(kg_k,kpg_k,kpoint,nkpg,npw_k)
418      end if
419 
420 !    Compute nonlocal form factors ffnl at all (k+G)
421      ider=0;idir=0;dimffnl=1
422      if (stress_needed==1) then
423        ider=1;dimffnl=2+2*psps%useylm
424      end if
425      ABI_ALLOCATE(ffnl,(npw_k,dimffnl,psps%lmnmax,ntypat))
426      call mkffnl(psps%dimekb,dimffnl,psps%ekb,ffnl,psps%ffspl,gs_hamk%gmet,gs_hamk%gprimd,&
427 &     ider,idir,psps%indlmn,kg_k,kpg_k,kpoint,psps%lmnmax,psps%lnmax,psps%mpsang,psps%mqgrid_ff,&
428 &     nkpg,npw_k,ntypat,psps%pspso,psps%qgrid_ff,rmet,psps%usepaw,psps%useylm,ylm_k,ylmgr_k)
429      if ((stress_needed==1).and.(usefock_loc).and.(psps%usepaw==1))then
430        ider_str=1; dimffnl_str=7;idir_str=-7
431        ABI_ALLOCATE(ffnl_str,(npw_k,dimffnl_str,psps%lmnmax,ntypat))
432        call mkffnl(psps%dimekb,dimffnl_str,psps%ekb,ffnl_str,psps%ffspl,gs_hamk%gmet,gs_hamk%gprimd,&
433 &       ider_str,idir_str,psps%indlmn,kg_k,kpg_k,kpoint,psps%lmnmax,psps%lnmax,psps%mpsang,psps%mqgrid_ff,&
434 &       nkpg,npw_k,ntypat,psps%pspso,psps%qgrid_ff,rmet,psps%usepaw,psps%useylm,ylm_k,ylmgr_k)
435      end if
436 
437 !    Load k-dependent part in the Hamiltonian datastructure
438 !     - Compute 3D phase factors
439 !     - Prepare various tabs in case of band-FFT parallelism
440 !     - Load k-dependent quantities in the Hamiltonian
441      ABI_ALLOCATE(ph3d,(2,npw_k,gs_hamk%matblk))
442      call load_k_hamiltonian(gs_hamk,kpt_k=kpoint,istwf_k=istwf_k,npw_k=npw_k,&
443 &     kg_k=kg_k,kpg_k=kpg_k,ffnl_k=ffnl,ph3d_k=ph3d,compute_gbound=compute_gbound,compute_ph3d=.true.)
444 
445 !    Load band-FFT tabs (transposed k-dependent arrays)
446      if (mpi_enreg%paral_kgb==1) then
447        call bandfft_kpt_savetabs(my_bandfft_kpt,ffnl=ffnl_sav,ph3d=ph3d_sav,kpg=kpg_k_sav)
448        call prep_bandfft_tabs(gs_hamk,ikpt,mkmem,mpi_enreg)
449        call load_k_hamiltonian(gs_hamk,npw_fft_k=my_bandfft_kpt%ndatarecv, &
450 &       kg_k     =my_bandfft_kpt%kg_k_gather, &
451 &       kpg_k    =my_bandfft_kpt%kpg_k_gather, &
452        ffnl_k   =my_bandfft_kpt%ffnl_gather, &
453        ph3d_k   =my_bandfft_kpt%ph3d_gather,compute_gbound=compute_gbound)
454      end if
455 
456      call timab(922,2,tsec)
457 
458 !    Loop over (blocks of) bands; accumulate forces and/or stresses
459 !    The following is now wrong. In sequential, nblockbd=nband_k/bandpp
460 !    blocksize= bandpp (JB 2016/04/16)
461 !    Note that in sequential mode iblock=iband, nblockbd=nband_k and blocksize=1
462      ABI_ALLOCATE(lambda,(blocksize))
463      ABI_ALLOCATE(occblock,(blocksize))
464      ABI_ALLOCATE(weight,(blocksize))
465      ABI_ALLOCATE(enlout,(nnlout*blocksize))
466      occblock=zero;weight=zero;enlout(:)=zero
467      if (usefock_loc) then
468        if (fockcommon%optstr) then
469          ABI_ALLOCATE(fockcommon%stress_ikpt,(6,nband_k))
470          fockcommon%stress_ikpt=zero
471        end if
472      end if
473      if ((usefock_loc).and.(psps%usepaw==1)) then
474        if (fockcommon%optfor) then
475          fockcommon%forces_ikpt=zero
476        end if
477      end if
478 
479      do iblock=1,nblockbd
480 
481        iband=(iblock-1)*blocksize+1;iband_last=min(iband+blocksize-1,nband_k)
482        iband_cprj=(iblock-1)*bandpp+1
483        if(proc_distrb_cycle(mpi_enreg%proc_distrb,ikpt,iband,iband_last,isppol,me_distrb)) cycle
484 
485 !      Select occupied bandsddk
486        occblock(:)=occ(1+(iblock-1)*blocksize+bdtot_index:iblock*blocksize+bdtot_index)
487        if( abs(maxval(occblock))>=tol8 ) then
488          call timab(923,1,tsec)
489          weight(:)=wtk(ikpt)*occblock(:)
490 
491 !        gs_hamk%ffnl_k is changed in fock_getghc, so that it is necessary to restore it when stresses are to be calculated.
492          if ((stress_needed==1).and.(usefock_loc).and.(psps%usepaw==1))then
493            call load_k_hamiltonian(gs_hamk,ffnl_k=ffnl)
494          end if
495 
496 !        Load contribution from n,k
497          cwavef(:,1:npw_k*my_nspinor*blocksize)=&
498 &         cg(:,1+(iblock-1)*npw_k*my_nspinor*blocksize+icg:iblock*npw_k*my_nspinor*blocksize+icg)
499          if (psps%usepaw==1.and.usecprj_local==1) then
500            call pawcprj_get(gs_hamk%atindx1,cwaveprj,cprj,natom,iband_cprj,ibg,ikpt,0,isppol,&
501 &           mband_cprj,mkmem,natom,bandpp,nband_cprj_k,my_nspinor,nsppol,0,&
502 &           mpicomm=mpi_enreg%comm_kpt,proc_distrb=mpi_enreg%proc_distrb)
503          end if
504 
505          call timab(923,2,tsec)
506          call timab(926,1,tsec)
507 
508          lambda(1:blocksize)= eigen(1+(iblock-1)*blocksize+bdtot_index:iblock*blocksize+bdtot_index)
509          if (mpi_enreg%paral_kgb/=1) then
510            call nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir,lambda,mpi_enreg,blocksize,nnlout,&
511 &           paw_opt,signs,nonlop_dum,tim_nonlop,cwavef,cwavef)
512          else
513            call prep_nonlop(choice,cpopt,cwaveprj,enlout,gs_hamk,idir,lambda,blocksize,&
514 &           mpi_enreg,nnlout,paw_opt,signs,nonlop_dum,tim_nonlop_prep,cwavef,cwavef)
515          end if
516          if ((stress_needed==1).and.(usefock_loc).and.(psps%usepaw==1))then
517            call load_k_hamiltonian(gs_hamk,ffnl_k=ffnl_str)
518          end if
519          call timab(926,2,tsec)
520 
521 !        Accumulate non-local contributions from n,k
522          if (optfor==1) then
523            do iblocksize=1,blocksize
524              ibs=nnlout*(iblocksize-1)
525              grnl(1:3*natom)=grnl(1:3*natom)+weight(iblocksize)*enlout(ibs+1+ishift:ibs+3*natom+ishift)
526            end do
527          end if
528          if (stress_needed==1) then
529            do iblocksize=1,blocksize
530              ibs=nnlout*(iblocksize-1)
531              npsstr(1:6)=npsstr(1:6) + weight(iblocksize)*enlout(ibs+1:ibs+6)
532            end do
533          end if
534 
535 !        Accumulate stress tensor kinetic contributions
536          if (stress_needed==1) then
537            call timab(924,1,tsec)
538            do iblocksize=1,blocksize
539              call meanvalue_g(ar,kstr1,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
540 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),&
541 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),0)
542              kinstr(1)=kinstr(1)+weight(iblocksize)*ar
543              call meanvalue_g(ar,kstr2,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
544 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),&
545 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),0)
546              kinstr(2)=kinstr(2)+weight(iblocksize)*ar
547              call meanvalue_g(ar,kstr3,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
548 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),&
549 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),0)
550              kinstr(3)=kinstr(3)+weight(iblocksize)*ar
551              call meanvalue_g(ar,kstr4,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
552 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),&
553 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),0)
554              kinstr(4)=kinstr(4)+weight(iblocksize)*ar
555              call meanvalue_g(ar,kstr5,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
556 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),&
557 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),0)
558              kinstr(5)=kinstr(5)+weight(iblocksize)*ar
559              call meanvalue_g(ar,kstr6,0,istwf_k,mpi_enreg,npw_k,my_nspinor,&
560 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),&
561 &             cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),0)
562              kinstr(6)=kinstr(6)+weight(iblocksize)*ar
563            end do
564            call timab(924,2,tsec)
565          end if
566 
567 !        Accumulate stress tensor and forces for the Fock part
568          if (usefock_loc) then
569            if(fockcommon%optstr.or.fockcommon%optfor) then
570              if (mpi_enreg%paral_kgb==1) then
571                msg='forsrtnps: Paral_kgb is not yet implemented for fock stresses'
572                MSG_BUG(msg)
573              end if 
574              ndat=mpi_enreg%bandpp
575              if (gs_hamk%usepaw==0) cwaveprj_idat => cwaveprj
576              ABI_ALLOCATE(ghc_dum,(0,0))
577              do iblocksize=1,blocksize
578                fockcommon%ieigen=(iblock-1)*blocksize+iblocksize
579                fockcommon%iband=(iblock-1)*blocksize+iblocksize
580                if (gs_hamk%usepaw==1) then
581                  cwaveprj_idat => cwaveprj(:,(iblocksize-1)*my_nspinor+1:iblocksize*my_nspinor)
582                end if
583                call fock_getghc(cwavef(:,1+(iblocksize-1)*npw_k*my_nspinor:iblocksize*npw_k*my_nspinor),cwaveprj_idat,&
584 &               ghc_dum,gs_hamk,mpi_enreg)
585                if (fockcommon%optstr) then
586                  fockcommon%stress(:)=fockcommon%stress(:)+weight(iblocksize)*fockcommon%stress_ikpt(:,fockcommon%ieigen)
587                end if
588                if (fockcommon%optfor) then
589                  fockcommon%forces(:,:)=fockcommon%forces(:,:)+weight(iblocksize)*fockcommon%forces_ikpt(:,:,fockcommon%ieigen)
590                end if
591              end do 
592              ABI_DEALLOCATE(ghc_dum)
593            end if
594          end if
595        end if
596 
597      end do ! End of loop on block of bands
598 
599 !    Restore the bandfft tabs
600      if (mpi_enreg%paral_kgb==1) then
601        call bandfft_kpt_restoretabs(my_bandfft_kpt,ffnl=ffnl_sav,ph3d=ph3d_sav,kpg=kpg_k_sav)
602      end if
603 
604 !    Increment indexes
605      bdtot_index=bdtot_index+nband_k
606      if (mkmem/=0) then
607        ibg=ibg+my_nspinor*nband_cprj_k
608        icg=icg+npw_k*my_nspinor*nband_k
609        ikg=ikg+npw_k
610      end if
611 
612      if (usefock_loc) then
613        if (fockcommon%optstr) then
614          ABI_DEALLOCATE(fockcommon%stress_ikpt)
615        end if
616      end if
617 
618      if (psps%usepaw==1) then
619        call pawcprj_free(cwaveprj)
620      end if
621      ABI_DATATYPE_DEALLOCATE(cwaveprj)
622      ABI_DEALLOCATE(cwavef)
623 
624      ABI_DEALLOCATE(lambda)
625      ABI_DEALLOCATE(occblock)
626      ABI_DEALLOCATE(weight)
627      ABI_DEALLOCATE(enlout)
628      ABI_DEALLOCATE(ffnl)
629      ABI_DEALLOCATE(kg_k)
630      ABI_DEALLOCATE(kpg_k)
631      ABI_DEALLOCATE(ylm_k)
632      ABI_DEALLOCATE(ylmgr_k)
633      ABI_DEALLOCATE(ph3d)
634      if (stress_needed==1) then
635        ABI_DEALLOCATE(kstr1)
636        ABI_DEALLOCATE(kstr2)
637        ABI_DEALLOCATE(kstr3)
638        ABI_DEALLOCATE(kstr4)
639        ABI_DEALLOCATE(kstr5)
640        ABI_DEALLOCATE(kstr6)
641      end if
642      if ((stress_needed==1).and.(usefock_loc).and.(psps%usepaw==1))then
643        ABI_DEALLOCATE(ffnl_str)
644      end if
645 
646    end do ! End k point loop
647  end do ! End loop over spins
648 
649 !Stress is equal to dE/d_strain * (1/ucvol)
650  npsstr(:)=npsstr(:)/gs_hamk%ucvol
651 
652 !Parallel case: accumulate (n,k) contributions
653  if (xmpi_paral==1) then
654 !  Forces
655    if (optfor==1) then
656      call timab(65,1,tsec)
657      call xmpi_sum(grnl,spaceComm,ierr)
658      call timab(65,2,tsec)
659      if ((usefock_loc).and.(psps%usepaw==1)) then
660        call xmpi_sum(fockcommon%forces,spaceComm,ierr)
661      end if
662    end if
663 !  Stresses
664    if (stress_needed==1) then
665      call timab(65,1,tsec)
666      call xmpi_sum(kinstr,spaceComm,ierr)
667      call xmpi_sum(npsstr,spaceComm,ierr)
668      if ((usefock_loc).and.(fockcommon%optstr)) then
669        call xmpi_sum(fockcommon%stress,spaceComm,ierr)
670      end if
671      call timab(65,2,tsec)
672    end if
673  end if
674 
675  call timab(925,1,tsec)
676 
677 !Do final normalizations and symmetrizations of stress tensor contributions
678  if (stress_needed==1) then
679    renorm_factor=-(two_pi**2)/effmass_free/gs_hamk%ucvol
680    kinstr(:)=kinstr(:)*renorm_factor
681    if (nsym>1) then
682      call stresssym(gs_hamk%gprimd,nsym,kinstr,symrec)
683      call stresssym(gs_hamk%gprimd,nsym,npsstr,symrec)
684      if ((usefock_loc).and.(fockcommon%optstr)) then
685        call stresssym(gs_hamk%gprimd,nsym,fockcommon%stress,symrec)
686      end if
687    end if
688  end if
689 
690 !Need to reorder cprj=<p_lmn|Cnk> (from atom-sorted to unsorted)
691  if (psps%usepaw==1.and.usecprj_local==1) then
692    call pawcprj_reorder(cprj,gs_hamk%atindx1)
693  end if
694 
695 !Deallocate temporary space
696  call destroy_hamiltonian(gs_hamk)
697  if (usefock_loc) then
698    fockcommon%use_ACE=use_ACE_old
699  end if
700  call timab(925,2,tsec)
701  call timab(920,2,tsec)
702 
703 end subroutine forstrnps