TABLE OF CONTENTS


ABINIT/nonlop [ Functions ]

[ Top ] [ Functions ]

NAME

 nonlop

FUNCTION

 This routine is a driver to compute:
 * Application of a nonlocal operator Vnl_k_k^prime in order to get:
    - contracted elements (energy, forces, stresses, ...), if signs=1
    - a function in reciprocal space (|out> = Vnl|in>),    if signs=2
 * Optionally, in case of PAW calculation:
   - Application of the overlap matrix in reciprocal space
     (<in|S|in> or (I+S)|in>).
   - Application of (Vnl-lambda.S) in reciprocal space
 According to user s choice, the routine calls a subroutine, computing all quantities:
   - using Legendre Polynomials Pl (Norm-conserving psps only)
   - using Spherical Harmonics Ylm (N-conserving or PAW ; compulsory for PAW)
   - using GPUs (N-conserving or PAW)

COPYRIGHT

 Copyright (C) 1998-2018 ABINIT group (MT)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt.

INPUTS

  choice: chooses possible output:
    choice=0 => do nothing (only compute WF projected with NL projectors)
          =1 => non-local energy contribution
          =2 => 1st derivative(s) with respect to atomic position(s)
          =3 => 1st derivative(s) with respect to strain(s)
          =23=> 1st derivative(s) with respect to atomic pos. and
                1st derivative(s) with respect to atomic pos. and strains
          =4 => 2nd derivative(s) with respect to 2 atomic pos.
          =24=> 1st derivative(s) with respect to atm. pos. and
                2nd derivative(s) with respect to 2 atomic pos.
          =5 => 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |p_i> D_ij <dp_j/dk| + |dp_i/dk> D_ij < p_j| ]
          =6 => 2nd derivative(s) with respect to 2 strains and
                mixed 2nd derivative(s) with respect to strains & atomic pos.
          =51 =>right 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |p_i> D_ij <dp_j/dk| ]
          =52 =>left 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |dp_i/dk> D_ij < p_j| ]
          =53 =>twist 1st derivative(s) with respect to k, typically
                sum_ij [ |dp_i/dk_(idir+1)> D_ij <dp_j//dk_(idir-1)|
                        -|dp_i/dk_(idir-1)> D_ij <dp_j//dk_(idir+1)|]
          =54=> mixed 2nd derivative(s) with respect to atomic pos. and left k wavevector
          =55=> mixed 2nd derivative(s) with respect to strain and right k wavevector
          =7 => apply operator $\sum_i [ |p_i> <p_i| ],
                same as overlap operator with s_ij=identity (paw_opt==3 only)
          =8 => 2nd derivatives with respect to 2 k wavevectors
          =81=> partial 2nd derivatives with respect to 2 k wavevectors,
                full derivative with respect to k1, right derivative with respect to k2,
                (derivative with respect to k of choice 51), typically
                sum_ij [ |dp_i/dk1> D_ij <dp_j/dk2| + |p_i> D_ij < d2p_j/dk1dk2| ]
    Only choices 1,2,3,23,4,5,6 are compatible with useylm=0.
    Only choices 1,2,3,5,51,52,53,7,8,81 are compatible with signs=2
  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
  [enl]=optional (if not present, use hamk%ekb); non-local coeffs connecting projectors
        see hamk%ekb description
  hamk <type(gs_hamiltonian_type)>=data defining the Hamiltonian at a given k (NL part involved here)
     | atindx1(natom)=index table for atoms, inverse of atindx
     | dimekb1,dimekb2=dimensions of ekb (see ham%ekb)
     | ekb(dimekb1,dimekb2,nspinor**2)=
     |   ->NC psps (paw_opt=0) : Kleinman-Bylander energies (hartree)
     |                           dimekb1=lmnmax, dimekb2=ntypat
     |   ->PAW (paw_opt=1 or 4): Dij coefs connecting projectors (ij symmetric)
     |                           dimekb1=cplex_ekb*lmnmax*(lmnmax+1)/2, dimekb2=natom
     |                           Complex numbers if cplex_ekb=2
     |                           ekb(:,:,1)= Dij^up-up, ekb(:,:,2)= Dij^dn-dn
     |                           ekb(:,:,3)= Dij^up-dn, ekb(:,:,4)= Dij^dn-up (only if nspinor=2)
     | ffnl_k(npw_k,dimffnl_k,lmnmax,ntypat)=nonlocal form factors at k
     | ffnl_kp(npw_kp,dimffnl_kp,lmnmax,ntypat)=nonlocal form factors at k^prime
     | gmet(3,3)=metric tensor for G vecs (in bohr**-2)
     | gprimd(3,3)=dimensional reciprocal space primitive translations
     | indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=ln (useylm=0) or i=lmn (useylm=1)
     | istwf_k=option parameter that describes the storage of wfs at k
     | istwf_kp=option parameter that describes the storage of wfs at k^prime
     | lmnmax=max. number of (l,m,n) components over all types of atoms
     | matblk=dimension of the arrays ph3d_k and ph3d_kp
     | mgfft=maximum size of 1D FFTs
     | mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
     | mpssoang= 1+max(spin*angular momentum) for nonlocal pseudopotentials
     | natom=number of atoms in cell
     | nattyp(ntypat)=number of atoms of each type
     | ngfft(18)=contain all needed information about 3D FFT
     ! kg_k(3,npw_k)=integer coords of planewaves in basis sphere, for k
     ! kg_kp(3,npw_kp)=integer coords of planewaves in basis sphere, for k^prime
     ! kpg_k(npw_k,:)= (k+G) components and related data
     ! kpg_kp(npw_kp,:)=(k^prime+G) components and related data,
     ! kpt(3)=k point in terms of recip. translations
     ! kptp(3)=k^prime point in terms of recip. translations
     | nloalg(3)=governs the choice of the algorithm for nonlocal operator
     ! npw_k=number of (k+G) planewaves
     ! npw_kp=number of (k^prime+G) planewaves
     | ntypat=number of types of atoms in cell
     | nspinor=total number of spinorial components of the wavefunctions
     | ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
     | ph3d_k(2,npw_k,matblk)=3D structure factors, for each atom and (k+g) plane wave
     | ph3d_kp(2,npw_kp,matblk)=3-dim structure factors, for each atom and (k^prime+g) plane wave
     | phkxred(2,natom)=phase factors exp(2 pi k.xred)
     | phkpxred(2,natom)=phase factors exp(2 pi k^prime.xred)
     | sij(dimekb1,ntypat)=overlap matrix components (only if paw_opt=2, 3 or 4)
     | ucvol=unit cell volume (bohr^3)
     | use_gpu_cuda=governs wheter we do the hamiltonian calculation on gpu or not
     | useylm=how the NL operator is to be applied: 1=using Ylm, 0=using Legendre polynomials
  [iatom_only]=optional. If present (and >0), only projectors related to atom of index iatom_only
          will be applied. (used fi to apply derivative of NL operator wrt an atomic displacement)
  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
                        - k point direction in the case (choice=5,51,or 52)
                          for choice 53 signs=2, cross derivatives are in idir-1 and idir+1 directions
                        - strain component (1:6) in the case (choice=3,signs=2) or (choice=6,signs=1)
  lambda=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
         Typically lambda is the eigenvalue (or its guess)
  mpi_enreg=informations about MPI parallelization
  ndat=number of wavefunctions on which to apply nonlop
  nnlout=dimension of enlout (when signs=1 and choice>0):
         ==== if paw_opt=0, 1 or 2 ====
         choice   nnlout     |  choice   nnlout
              1   1          |      51   6 (complex)
              2   3*natom    |      52   6 (complex)
              3   6          |      53   6
              4   6*natom    |      54   9*natom
             23   6+3*natom  |      55   36 (complex)
             24   9*natom    |       6   36+18*natom
              5   3          |       8   6
                             |      81   18 (complex)
         ==== if paw_opt=3 ====
         choice   nnlout
              1   1
              2   3*natom
              5   3
             51   3
             52   3
             54   9*natom
             55   36
              7   1
              8   6
             81   9
         ==== if paw_opt=4 ====
         not available
  [only_SO]=optional, flag to calculate only the SO part in nonlop
  paw_opt= define the nonlocal operator concerned with:
           paw_opt=0 : Norm-conserving Vnl (use of Kleinman-Bylander ener.)
           paw_opt=1 : PAW nonlocal part of H (use of Dij coeffs)
           paw_opt=2 : PAW: (Vnl-lambda.Sij) (Sij=overlap matrix)
           paw_opt=3 : PAW overlap matrix (Sij)
           paw_opt=4 : both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
  [select_k]=optional, option governing the choice of k points to be used.
             hamk datastructure contains quantities needed to apply NL operator
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|Vnl|k>       is applied [default]
             if select_k=2, <k|Vnl|k^prime>       is applied
             if select_k=3, <k|Vnl|k>             is applied
             if select_k=4, <k^prime|Vnl|k^prime> is applied
  signs= if 1, get contracted elements (energy, forces, stress, ...)
         if 2, applies the non-local operator to a function in reciprocal space
  tim_nonlop=timing code of the calling routine (can be set to 0 if not attributed)
  vectin(2,npwin*my_nspinor*ndat)=input cmplx wavefunction coefficients <G|Cnk>

OUTPUT

 ==== if (signs==1) ====
 --If (paw_opt==0, 1 or 2)
    enlout(nnlout)= contribution to the non-local part of the following properties:
      if choice=1 : enlout(1)             -> the energy
      if choice=2 : enlout(3*natom)       -> 1st deriv. of energy wrt atm. pos (forces)
      if choice=3 : enlout(6)             -> 1st deriv. of energy wrt strain (stresses)
      if choice=4 : enlout(6*natom)       -> 2nd deriv. of energy wrt 2 atm. pos (dyn. mat.)
      if choice=23: enlout(6+3*natom)     -> 1st deriv. of energy wrt atm. pos (forces) and
                                             1st deriv. of energy wrt strain (stresses)
      if choice=24: enlout(9*natom)       -> 1st deriv. of energy wrt atm. pos (forces) and
                                             2nd deriv. of energy wrt 2 atm. pos (dyn. mat.)
      if choice=5 : enlout(3)             -> 1st deriv. of energy wrt k
      if choice=51: enlout(3)             -> 1st deriv. (right) of energy wrt k
      if choice=52: enlout(3)             -> 1st deriv. (left) of energy wrt k
      if choice=53: enlout(3)             -> 1st deriv. (twist) of energy wrt k
      if choice=54: enlout(18*natom)      -> 2nd deriv. of energy wrt atm. pos and right k (Born eff. charge)
      if choice=55: enlout(36)            -> 2nd deriv. of energy wrt strain and right k (piezoelastic tensor)
      if choice=6 : enlout(36+18*natom)   -> 2nd deriv. of energy wrt 2 strains (elast. tensor) and
                                             2nd deriv. of energy wrt to atm. pos and strain (internal strain)
      if choice=8 : enlout(6)             -> 2nd deriv. of energy wrt 2 k
      if choice=81: enlout(9)             -> 2nd deriv.of E: full derivative w.r.t. k1, right derivative w.r.t k2
 --If (paw_opt==3)
      if choice=1 : enlout(1)             -> contribution to <c|S|c> (note: not including <c|c>)
      if choice=2 : enlout(3*natom)       -> contribution to <c|dS/d_atm.pos|c>
      if choice=51: enlout(3)             -> contribution to <c|d(right)S/d_k|c>
      if choice=52: enlout(3)             -> contribution to <c|d(left)S/d_k|c>
      if choice=54: enlout(18*natom)      -> 2nd deriv. of energy wrt atm. pos and right k (Born eff. charge)
      if choice=55: enlout(36)            -> 2nd deriv. of energy wrt strain and right k (piezoelastic tensor)
      if choice=7 : enlout(1)             -> contribution to <c|sum_i[p_i><p_i]|c>
      if choice=8 : enlout(6)             -> contribution to <c|d2S/d_k1d_k2|c>
      if choice=81: enlout(9)             -> contribution to <c|dS/d_k1[d(right)d_k2]|c>
 --If (paw_opt==4)
      not available
 ==== if (signs==2) ====
 --if (paw_opt=0, 1 or 4)
    vectout(2,npwout*my_nspinor*ndat)=result of the aplication of the concerned operator
                or one of its derivatives to the input vect.:
      if (choice=1)  <G|V_nonlocal|vect_in>
      if (choice=2)  <G|dV_nonlocal/d(atm. pos)|vect_in>
      if (choice=3)  <G|dV_nonlocal/d(strain)|vect_in>
      if (choice=5)  <G|dV_nonlocal/d(k)|vect_in>
      if (choice=51) <G|d(right)V_nonlocal/d(k)|vect_in>
      if (choice=52) <G|d(left)V_nonlocal/d(k)|vect_in>
      if (choice=53) <G|d(twist)V_nonlocal/d(k)|vect_in>
      if (choice=8)  <G|d2V_nonlocal/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right)V_nonlocal/d(k)]/d(k)|vect_in>
 --if (paw_opt=2)
    vectout(2,npwout*my_nspinor*ndat)=final vector in reciprocal space:
      if (choice=1)  <G|V_nonlocal-lamdba.(I+S)|vect_in>
      if (choice=2)  <G|d[V_nonlocal-lamdba.(I+S)]/d(atm. pos)|vect_in>
      if (choice=3)  <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_in>
      if (choice=5)  <G|d[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=51) <G|d(right)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=52) <G|d(left)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=53) <G|d(twist)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=8)  <G|d2[V_nonlocal-lamdba.(I+S)]/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right[V_nonlocal-lamdba.(I+S)]/d(k)]/d(k)|vect_in>
 --if (paw_opt=3 or 4)
    svectout(2,npwout*my_nspinor*ndat)=result of the aplication of Sij (overlap matrix)
                  or one of its derivatives to the input vect.:
      if (choice=1)  <G|I+S|vect_in>
      if (choice=2)  <G|dS/d(atm. pos)|vect_in>
      if (choice=3)  <G|dS/d(strain)|vect_in>
      if (choice=5)  <G|dS/d(k)|vect_in>
      if (choice=51) <G|d(right)S/d(k)|vect_in>
      if (choice=52) <G|d(left)S/d(k)|vect_in>
      if (choice=53) <G|d(twist)S/d(k)|vect_in>
      if (choice=3)  <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_in>
      if (choice=7)  <G|sum_i[p_i><p_i]|vect_in>
      if (choice=8)  <G|d2S/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right)S/d(k)]/d(k)|vect_in>

SIDE EFFECTS

  ==== ONLY IF useylm=1
  cprjin(natom,my_nspinor*ndat) <type(pawcprj_type)>=projected input wave function |in> on non-local projectors
                                  =<p_lmn|in> and derivatives
                    Treatment depends on cpopt parameter:
                     if cpopt=-1, <p_lmn|in> (and derivatives)
                                  are computed here (and not saved)
                     if cpopt= 0, <p_lmn|in> are computed here and saved
                                  derivatives are eventually computed but not saved
                     if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 2  <p_lmn|in> are already in memory;
                                  first (and 2nd) derivatives are computed here and not saved
                     if cpopt= 3  <p_lmn|in> are already in memory;
                                  first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 4  <p_lmn|in> and first derivatives are already in memory;
                                  other derivatives are not computed
                                  This option is not compatible with choice=4,24 or 6
                     If useylm=0, must have cpopt=-1!
                     Warning: for cpopt= 1 or 3, derivatives wrt strains do not contain
                              the contribution due to the volume change;
                              i.e. <dp_lmn/dEps|in> are incomplete.

NOTES

 * See nonlop_pl and nonlop_ylm to have more comments...
 * In the case signs=1, the array vectout is not used.

PARENTS

      d2frnl,dfpt_nsteltwf,dfptnl_resp,energy,fock_getghc,forstrnps,getgh1c
      getgh2c,getghc,getgsc,m_invovl,m_lobpcgwf,make_grad_berry,nonlop_test
      prep_nonlop,vtowfk,wfd_vnlpsi

CHILDREN

      gemm_nonlop,nonlop_gpu,nonlop_pl,nonlop_ylm,pawcprj_alloc,pawcprj_copy
      pawcprj_free,timab

SOURCE

275 #if defined HAVE_CONFIG_H
276 #include "config.h"
277 #endif
278 
279 #include "abi_common.h"
280 
281 
282 subroutine nonlop(choice,cpopt,cprjin,enlout,hamk,idir,lambda,mpi_enreg,ndat,nnlout,&
283 &                 paw_opt,signs,svectout,tim_nonlop,vectin,vectout,&
284 &                 enl,iatom_only,only_SO,select_k) !optional arguments
285 
286  use defs_basis
287  use defs_abitypes
288  use m_errors
289  use m_profiling_abi
290  use m_gemm_nonlop
291 
292  use m_hamiltonian, only : gs_hamiltonian_type,KPRIME_H_K,K_H_KPRIME,K_H_K,KPRIME_H_KPRIME
293  use m_pawcprj,     only : pawcprj_type,pawcprj_alloc,pawcprj_free,pawcprj_copy
294 
295 !This section has been created automatically by the script Abilint (TD).
296 !Do not modify the following lines by hand.
297 #undef ABI_FUNC
298 #define ABI_FUNC 'nonlop'
299  use interfaces_18_timing
300  use interfaces_66_nonlocal, except_this_one => nonlop
301 !End of the abilint section
302 
303  implicit none
304 
305 !Arguments ------------------------------------
306 !scalars
307  integer,intent(in) :: choice,cpopt,idir,ndat,nnlout,paw_opt,signs,tim_nonlop
308  integer,intent(in),optional :: iatom_only,only_SO,select_k
309  type(MPI_type),intent(in) :: mpi_enreg
310  type(gs_hamiltonian_type),intent(in),target :: hamk
311 !arrays
312  real(dp),intent(in) :: lambda(ndat)
313  real(dp),intent(in),target,optional :: enl(:,:,:)
314  real(dp),intent(inout),target :: vectin(:,:)
315  real(dp),intent(out),target :: enlout(:),svectout(:,:)
316  real(dp),intent(inout),target :: vectout(:,:)
317  type(pawcprj_type),intent(inout),target :: cprjin(:,:)
318 
319 !Local variables-------------------------------
320 !scalars
321  integer :: dimenl1,dimenl2,dimenl2_,dimffnlin,dimffnlout,dimsij,iatm,iatom_only_,idat
322  integer :: ispden,ispinor,istwf_k,itypat,jspinor,matblk_,my_nspinor,n1,n2,n3,natom_,ncpgr_atm
323  integer :: nkpgin,nkpgout,npwin,npwout,ntypat_,only_SO_,select_k_,shift1,shift2,shift3
324  logical :: atom_pert,force_recompute_ph3d,hermdij,kpgin_allocated,kpgout_allocated,use_gemm_nonlop
325  character(len=500) :: msg
326 !arrays
327  integer :: nlmn_atm(1),nloalg_(3)
328  integer,pointer :: kgin(:,:),kgout(:,:)
329  integer, ABI_CONTIGUOUS pointer :: atindx1_(:),indlmn_(:,:,:),nattyp_(:)
330  real(dp) :: tsec(2)
331  real(dp),pointer :: enl_ptr(:,:,:)
332  real(dp),pointer :: ffnlin(:,:,:,:),ffnlin_(:,:,:,:),ffnlout(:,:,:,:),ffnlout_(:,:,:,:)
333  real(dp),pointer :: kpgin(:,:),kpgout(:,:)
334  real(dp) :: kptin(3),kptout(3)
335  real(dp),pointer :: ph3din(:,:,:),ph3din_(:,:,:),ph3dout(:,:,:),ph3dout_(:,:,:)
336  real(dp),pointer :: phkxredin(:,:),phkxredin_(:,:),phkxredout(:,:),phkxredout_(:,:)
337  real(dp), ABI_CONTIGUOUS pointer :: ph1d_(:,:),sij_(:,:)
338  real(dp), pointer :: enl_(:,:,:)
339  type(pawcprj_type),pointer :: cprjin_(:,:)
340   integer :: b0,b1,b2,b3,b4,e0,e1,e2,e3,e4
341 
342 ! **********************************************************************
343 
344  DBG_ENTER("COLL")
345 
346 !Keep track of time spent in this routine (selection of different slots for different choices)
347  call timab(220+tim_nonlop,1,tsec)
348 
349  only_SO_=0; if (present(only_SO)) only_SO_=only_SO
350  my_nspinor=max(1,hamk%nspinor/mpi_enreg%nproc_spinor)
351  hermdij=(any(abs(hamk%nucdipmom)>tol8))
352 
353  force_recompute_ph3d=.false.
354 
355 !Error(s) on incorrect input
356  if (hamk%useylm==0) then
357    if (paw_opt>0) then
358      MSG_BUG('When paw_opt>0 you must use ylm version of nonlop! Set useylm 1.')
359    end if
360    if (cpopt/=-1) then
361      MSG_BUG('If useylm=0, ie no PAW, then cpopt/=-1 is not allowed !')
362    end if
363    if (hermdij) then
364      MSG_BUG('If useylm=0, ie no PAW, nucdipmom/=0 is not allowed !')
365    end if
366    if (hamk%use_gpu_cuda/=0) then
367      msg = 'When use_gpu_cuda/=0 you must use ylm version of nonlop! Set useylm 1.'
368      MSG_BUG(msg)
369    end if
370  end if
371  if ((.not.associated(hamk%kg_k)).or.(.not.associated(hamk%kg_kp))) then
372    MSG_BUG('kg_k/kg_kp should be associated!')
373  end if
374  if ((.not.associated(hamk%ffnl_k)).or.(.not.associated(hamk%ffnl_kp))) then
375    MSG_BUG('ffnl_k/ffnl_kp should be associated!')
376  end if
377 !if (hamk%istwf_k/=hamk%istwf_kp) then
378 !  msg = 'istwf has to be the same for both k-points.'
379 !  MSG_BUG(msg)
380 !end if
381 
382 !Select k-dependent objects according to select_k input parameter
383  select_k_=1;if (present(select_k)) select_k_=select_k
384  nkpgin=0;nkpgout=0;nullify(kpgin);nullify(kpgout)
385  nullify(ph3din);nullify(ph3dout)
386  if (select_k_==KPRIME_H_K) then
387 !  ===== <k^prime|Vnl|k> =====
388    kptin = hamk%kpt_k ; kptout = hamk%kpt_kp
389    npwin=hamk%npw_fft_k ; npwout=hamk%npw_fft_kp
390    kgin => hamk%kg_k ; kgout => hamk%kg_kp
391    if (associated(hamk%kpg_k)) then
392      kpgin => hamk%kpg_k ; nkpgin=size(kpgin,2)
393    end if
394    if (associated(hamk%kpg_kp)) then
395      kpgout => hamk%kpg_kp ; nkpgout=size(kpgout,2)
396    end if
397    phkxredin => hamk%phkxred ; phkxredout => hamk%phkpxred
398    ffnlin => hamk%ffnl_k ; ffnlout => hamk%ffnl_kp
399    if (associated(hamk%ph3d_k )) ph3din  => hamk%ph3d_k
400    if (associated(hamk%ph3d_kp)) ph3dout => hamk%ph3d_kp
401    force_recompute_ph3d=(.not.(associated(hamk%ph3d_k).and.associated(hamk%ph3d_kp)))
402    istwf_k=hamk%istwf_k
403  else if (select_k_==K_H_KPRIME) then
404 !  ===== <k|Vnl|k^prime> =====
405    kptin = hamk%kpt_kp ; kptout = hamk%kpt_k
406    npwin=hamk%npw_fft_kp ; npwout=hamk%npw_fft_k
407    kgin => hamk%kg_kp ; kgout => hamk%kg_k
408    if (associated(hamk%kpg_kp)) then
409      kpgin => hamk%kpg_kp ; nkpgin=size(kpgin,2)
410    end if
411    if (associated(hamk%kpg_k)) then
412      kpgout => hamk%kpg_k ; nkpgout=size(kpgout,2)
413    end if
414    phkxredin => hamk%phkpxred ; phkxredout => hamk%phkxred
415    ffnlin => hamk%ffnl_kp ; ffnlout => hamk%ffnl_k
416    if (associated(hamk%ph3d_kp)) ph3din  => hamk%ph3d_kp
417    if (associated(hamk%ph3d_k )) ph3dout => hamk%ph3d_k
418    force_recompute_ph3d=(.not.(associated(hamk%ph3d_kp).and.associated(hamk%ph3d_k)))
419    istwf_k=hamk%istwf_kp
420  else if (select_k_==K_H_K) then
421 !  ===== <k|Vnl|k> =====
422    kptin = hamk%kpt_k ; kptout = hamk%kpt_k
423    npwin=hamk%npw_fft_k ; npwout=hamk%npw_fft_k
424    kgin => hamk%kg_k ; kgout => hamk%kg_k
425    if (associated(hamk%kpg_k)) then
426      kpgin => hamk%kpg_k ; nkpgin=size(kpgin,2)
427    end if
428    if (associated(hamk%kpg_k)) then
429      kpgout => hamk%kpg_k ; nkpgout=size(kpgout,2)
430    end if
431    phkxredin => hamk%phkxred ; phkxredout => hamk%phkxred
432    ffnlin => hamk%ffnl_k ; ffnlout => hamk%ffnl_k
433    if (associated(hamk%ph3d_k)) ph3din  => hamk%ph3d_k
434    if (associated(hamk%ph3d_k)) ph3dout => hamk%ph3d_k
435    force_recompute_ph3d=(.not.(associated(hamk%ph3d_k)))
436    istwf_k=hamk%istwf_k
437  else if (select_k_==KPRIME_H_KPRIME) then
438 !  ===== <k^prime|Vnl|k^prime> =====
439    kptin = hamk%kpt_kp ; kptout = hamk%kpt_kp
440    npwin=hamk%npw_fft_kp ; npwout=hamk%npw_fft_kp
441    kgin => hamk%kg_kp ; kgout => hamk%kg_kp
442    if (associated(hamk%kpg_kp)) then
443      kpgin => hamk%kpg_kp ; nkpgin=size(kpgin,2)
444    end if
445    if (associated(hamk%kpg_kp)) then
446      kpgout => hamk%kpg_kp ; nkpgout=size(kpgout,2)
447    end if
448    phkxredin => hamk%phkpxred ; phkxredout => hamk%phkpxred
449    ffnlin => hamk%ffnl_kp ; ffnlout => hamk%ffnl_kp
450    if (associated(hamk%ph3d_kp)) ph3din  => hamk%ph3d_kp
451    if (associated(hamk%ph3d_kp)) ph3dout => hamk%ph3d_kp
452    force_recompute_ph3d=(.not.(associated(hamk%ph3d_kp)))
453    istwf_k=hamk%istwf_kp
454  end if
455 
456  if (npwin==0.or.npwout==0) return
457  dimffnlin=size(ffnlin,2);dimffnlout=size(ffnlout,2)
458  kpgin_allocated=(.not.associated(kpgin))
459  if (kpgin_allocated) then
460    ABI_ALLOCATE(kpgin,(npwin,0))
461  end if
462  kpgout_allocated=(.not.associated(kpgout))
463  if (kpgout_allocated) then
464    ABI_ALLOCATE(kpgout,(npwout,0))
465  end if
466 
467 !Check some sizes for safety
468 !if (paw_opt==0.or.cpopt<2.or.((cpopt==2.or.cpopt==3).and.choice>1)) then
469  if (size(ffnlin,1)/=npwin.or.size(ffnlin,3)/=hamk%lmnmax) then
470    msg = 'Incorrect size for ffnlin!'
471 !   MSG_BUG(msg)
472  end if
473  if(signs==2) then
474    if (size(ffnlout,1)/=npwout.or.size(ffnlout,3)/=hamk%lmnmax) then
475      msg = 'Incorrect size for ffnlout!'
476      MSG_BUG(msg)
477    end if
478  end if
479 !This test is OK only because explicit sizes are passed to nonlop_* routines
480  if (size(vectin)<2*npwin*my_nspinor*ndat) then
481    msg = 'Incorrect size for vectin!'
482    MSG_BUG(msg)
483  end if
484  if(choice/=0.and.signs==2) then
485    if(paw_opt/=3) then
486 !    This test is OK only because explicit sizes are passed to nonlop_* routines
487      if (size(vectout)<2*npwout*my_nspinor*ndat) then
488        msg = 'Incorrect size for vectout!'
489        MSG_BUG(msg)
490      end if
491    end if
492    if(paw_opt>=3) then
493      if (size(svectout)<2*npwout*my_nspinor*ndat) then
494        msg = 'Incorrect size for svectout!'
495        MSG_BUG(msg)
496      end if
497    end if
498  end if
499  if(cpopt>=0) then
500    if (size(cprjin)/=hamk%natom*my_nspinor*ndat) then
501      msg = 'Incorrect size for cprjin!'
502      MSG_BUG(msg)
503    end if
504  end if
505 
506 !Non-local coefficients connecting projectors:
507 !If enl is present in the arg list, use it; instead use hamk%ebk
508  if (present(enl)) then
509    enl_ptr => enl
510    dimenl1=size(enl,1);dimenl2=size(enl,2)
511  else
512    enl_ptr => hamk%ekb
513    dimenl1=hamk%dimekb1;dimenl2=hamk%dimekb2
514  end if
515 
516 !In the case of a derivative with respect to an atomic displacement,
517 !and if <g|dVnl/dR|c> is required (signs=2), we only need to compute the
518 !derivatives of the projectors associated with the displaced atom.
519  iatom_only_=-1;if (present(iatom_only)) iatom_only_=iatom_only
520  atom_pert=((signs==2).and.(choice==2.or.choice==4.or.choice==24.or.choice==54))
521 
522  if (iatom_only_>0.and.atom_pert) then
523 !   We consider only atom with index iatom_only
524    iatm=hamk%atindx(iatom_only_);itypat=hamk%typat(iatom_only_)
525    natom_=1 ; ntypat_=1 ; dimenl2_=1 ; matblk_=1
526    nloalg_(:)=hamk%nloalg(:)
527    ABI_ALLOCATE(atindx1_,(1))
528    ABI_ALLOCATE(nattyp_,(1))
529    atindx1_(1)=1 ; nattyp_(1)=1
530 !  Store at the right place the 1d phases
531    n1=hamk%ngfft(1);n2=hamk%ngfft(2);n3=hamk%ngfft(3)
532    ABI_ALLOCATE(ph1d_,(2,(2*n1+1)+(2*n2+1)+(2*n3+1)))
533    shift1=(iatm-1)*(2*n1+1)
534    ph1d_(:,1:2*n1+1)=hamk%ph1d(:,1+shift1:2*n1+1+shift1)
535    shift2=(iatm-1)*(2*n2+1)+hamk%natom*(2*n1+1)
536    ph1d_(:,1+2*n1+1:2*n2+1+2*n1+1)=hamk%ph1d(:,1+shift2:2*n2+1+shift2)
537    shift3=(iatm-1)*(2*n3+1)+hamk%natom*(2*n1+1+2*n2+1)
538    ph1d_(:,1+2*n1+1+2*n2+1:2*n3+1+2*n2+1+2*n1+1)=hamk%ph1d(:,1+shift3:2*n3+1+shift3)
539    ABI_ALLOCATE(phkxredin_,(2,1))
540    ABI_ALLOCATE(phkxredout_,(2,1))
541    phkxredin_(:,1)=phkxredin(:,iatm)
542    phkxredout_(:,1)=phkxredout(:,iatm)
543    ABI_ALLOCATE(ph3din_,(2,npwin,1))
544    ABI_ALLOCATE(ph3dout_,(2,npwout,1))
545    if (force_recompute_ph3d.or.hamk%matblk<hamk%natom) then
546      nloalg_(2)=-abs(nloalg_(2)) !Will compute the 3D phase factors inside nonlop
547    else
548      ph3din_(:,1:npwin,1)=ph3din(:,1:npwin,iatm)
549      ph3dout_(:,1:npwout,1)=ph3dout(:,1:npwout,iatm)
550    end if
551    ABI_ALLOCATE(ffnlin_,(npwin,dimffnlin,hamk%lmnmax,1))
552    ABI_ALLOCATE(ffnlout_,(npwout,dimffnlout,hamk%lmnmax,1))
553    ffnlin_(:,:,:,1)=ffnlin(:,:,:,itypat)
554    ffnlout_(:,:,:,1)=ffnlout(:,:,:,itypat)
555    ABI_DATATYPE_ALLOCATE(cprjin_,(1,my_nspinor*((cpopt+5)/5)))
556    if (cpopt>=0) then
557      nlmn_atm(1)=cprjin(iatm,1)%nlmn
558      ncpgr_atm=cprjin(iatm,1)%ncpgr
559      call pawcprj_alloc(cprjin_,ncpgr_atm,nlmn_atm)
560      do idat=1,ndat
561        do ispinor=1,my_nspinor
562          jspinor=ispinor+(idat-1)*my_nspinor
563          call pawcprj_copy(cprjin(iatm:iatm,jspinor:jspinor),cprjin_(1:1,jspinor:jspinor))
564        end do
565      end do
566    end if
567    ABI_ALLOCATE(enl_,(size(enl_ptr,1),1,hamk%nspinor**2))
568    do ispden=1,hamk%nspinor**2
569      if (dimenl2==hamk%natom .and. hamk%usepaw==1) then
570        enl_(:,1,ispden)=enl_ptr(:,iatom_only_,ispden)
571      else if (dimenl2==hamk%ntypat) then
572        enl_(:,1,ispden)=enl_ptr(:,itypat,ispden)
573      else
574        enl_(:,1,ispden)=enl_ptr(:,1,ispden)
575      end if
576    end do
577    if (allocated(hamk%sij)) then
578      dimsij=size(hamk%sij,1)
579      ABI_ALLOCATE(sij_,(dimsij,1))
580      if (size(hamk%sij,2)==hamk%ntypat) then
581        sij_(:,1)=hamk%sij(:,itypat)
582      else if (size(hamk%sij)>0) then
583        sij_(:,1)=hamk%sij(:,1)
584      end if
585    end if
586    ABI_ALLOCATE(indlmn_,(6,hamk%lmnmax,1))
587    indlmn_(:,:,1)=hamk%indlmn(:,:,itypat)
588 
589  else
590 !  Usual case: all atoms are processed
591    natom_  =hamk%natom; ntypat_=hamk%ntypat
592    dimenl2_=dimenl2   ; matblk_=hamk%matblk
593    nloalg_(:)  = hamk%nloalg(:)
594    atindx1_    => hamk%atindx1
595    nattyp_     => hamk%nattyp
596    ph1d_       => hamk%ph1d
597    phkxredin_  => phkxredin
598    phkxredout_ => phkxredout
599    ffnlin_     => ffnlin
600    ffnlout_    => ffnlout
601    cprjin_     => cprjin
602    enl_        => enl_ptr
603    sij_        => hamk%sij
604    indlmn_     => hamk%indlmn
605    if (force_recompute_ph3d) then
606      nloalg_(2)=-abs(nloalg_(2)) !Will compute the 3D phase factors inside nonlop
607      ABI_ALLOCATE(ph3din_,(2,npwin,hamk%matblk))
608      ABI_ALLOCATE(ph3dout_,(2,npwout,hamk%matblk))
609    else
610      ph3din_     => ph3din
611      ph3dout_    => ph3dout
612    end if
613  end if
614 
615 !A specific version of nonlop based on BLAS3 can be used
616 !But there are several restrictions
617  use_gemm_nonlop= ( gemm_nonlop_use_gemm .and. &
618 & signs == 2 .and. paw_opt /= 2 .and. hamk%nspinor == 1 .and. &
619 & cpopt < 3 .and. hamk%useylm /= 0 .and. &
620 & (choice < 2 .or. choice == 7) )
621  if(use_gemm_nonlop) then
622    call gemm_nonlop(atindx1_,choice,cpopt,cprjin_,dimenl1,dimenl2_,&
623 &   dimffnlin,dimffnlout,enl_,enlout,ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,&
624 &   idir,indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,&
625 &   hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,hamk%mpsang,hamk%mpssoang,&
626 &   natom_,nattyp_,ndat,hamk%ngfft,nkpgin,nkpgout,nloalg_,&
627 &   nnlout,npwin,npwout,my_nspinor,hamk%nspinor,ntypat_,only_SO_,paw_opt,&
628 &   phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,svectout,&
629 &   tim_nonlop,hamk%ucvol,hamk%useylm,vectin,vectout,hamk%use_gpu_cuda)
630 
631  else
632    !$omp parallel do default(shared), &
633    !$omp& firstprivate(ndat,npwin,my_nspinor,choice,signs,paw_opt,npwout,cpopt,nnlout), &
634    !$omp& private(b0,b1,b2,b3,b4,e0,e1,e2,e3,e4)
635    !!$omp& schedule(static), if(hamk%use_gpu_cuda==0)
636    do idat=1, ndat
637      !vectin_idat => vectin(:,1+npwin*my_nspinor*(idat-1):npwin*my_nspinor*idat)
638      b0 = 1+npwin*my_nspinor*(idat-1)
639      e0 = npwin*my_nspinor*idat
640      if (choice/=0.and.signs==2.and.paw_opt/=3) then
641        !vectout_idat => vectout(:,1+npwout*my_nspinor*(idat-1):npwout*my_nspinor*idat)
642        b1 = 1+npwout*my_nspinor*(idat-1)
643        e1 = npwout*my_nspinor*idat
644      else
645        !vectout_idat => vectout
646        b1 = lbound(vectout,dim=2)
647        e1 = ubound(vectout,dim=2)
648      end if
649      if (choice/=0.and.signs==2.and.paw_opt>=3) then
650        !svectout_idat => svectout(:,1+npwout*my_nspinor*(idat-1):npwout*my_nspinor*idat)
651        b2 = 1+npwout*my_nspinor*(idat-1)
652        e2 = npwout*my_nspinor*idat
653      else
654        !svectout_idat => svectout
655        b2 = lbound(svectout,dim=2)
656        e2 = ubound(svectout,dim=2)
657      end if
658 
659      if (cpopt>=0) then
660        !cprjin_idat => cprjin_(:,my_nspinor*(idat-1)+1:my_nspinor*(idat))
661        b3 = my_nspinor*(idat-1)+1
662        e3 = my_nspinor*(idat)
663      else
664        !cprjin_idat => cprjin_
665        b3 = lbound(cprjin_,dim=2)
666        e3 = ubound(cprjin_,dim=2)
667      end if
668      if (nnlout>0) then
669       !enlout_idat => enlout((idat-1)*nnlout+1:(idat*nnlout))
670        b4 = (idat-1)*nnlout+1
671        e4 = (idat*nnlout)
672      else
673       !enlout_idat => enlout
674        b4 = lbound(enlout,dim=1)
675        e4 = ubound(enlout,dim=1)
676      end if
677 
678 !    Legendre Polynomials version
679      if (hamk%useylm==0) then
680        call nonlop_pl(choice,dimenl1,dimenl2_,dimffnlin,dimffnlout,enl_,&
681 &       enlout(b4:e4),ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,idir,indlmn_,istwf_k,&
682 &       kgin,kgout,kpgin,kpgout,kptin,kptout,hamk%lmnmax,matblk_,hamk%mgfft,&
683 &       mpi_enreg,hamk%mpsang,hamk%mpssoang,natom_,nattyp_,hamk%ngfft,&
684 &       nkpgin,nkpgout,nloalg_,npwin,npwout,my_nspinor,hamk%nspinor,&
685 &       ntypat_,only_SO_,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,hamk%ucvol,&
686 &       vectin(:,b0:e0),vectout(:,b1:e1))
687 !    Spherical Harmonics version
688      else if (hamk%use_gpu_cuda==0) then
689        call nonlop_ylm(atindx1_,choice,cpopt,cprjin_(:,b3:e3),dimenl1,dimenl2_,&
690 &       dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
691 &       indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
692 &       hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
693 &       nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
694 &       ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
695 &       svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1),hermdij=hermdij)
696 !    GPU version
697      else
698        call nonlop_gpu(atindx1_,choice,cpopt,cprjin(:,b3:e3),dimenl1,dimenl2_,&
699 &       dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
700 &       indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
701 &       hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
702 &       nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
703 &       ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
704 &       svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1))
705      end if
706    end do
707    !$omp end parallel do
708  end if
709 
710 !Release temporary storage
711  if (iatom_only_>0.and.atom_pert) then
712    if (cpopt>=0) then
713      call pawcprj_free(cprjin_)
714    end if
715    ABI_DEALLOCATE(atindx1_)
716    ABI_DEALLOCATE(nattyp_)
717    ABI_DEALLOCATE(ph1d_)
718    ABI_DEALLOCATE(ph3din_)
719    ABI_DEALLOCATE(ph3dout_)
720    ABI_DEALLOCATE(phkxredin_)
721    ABI_DEALLOCATE(phkxredout_)
722    ABI_DEALLOCATE(ffnlin_)
723    ABI_DEALLOCATE(ffnlout_)
724    ABI_DEALLOCATE(enl_)
725    ABI_DEALLOCATE(indlmn_)
726    ABI_DATATYPE_DEALLOCATE(cprjin_)
727    if (allocated(hamk%sij)) then
728      ABI_DEALLOCATE(sij_)
729    end if
730  else if (force_recompute_ph3d) then
731    ABI_DEALLOCATE(ph3din_)
732    ABI_DEALLOCATE(ph3dout_)
733  end if
734  if (kpgin_allocated) then
735    ABI_DEALLOCATE(kpgin)
736  end if
737  if (kpgout_allocated) then
738    ABI_DEALLOCATE(kpgout)
739  end if
740 
741  call timab(220+tim_nonlop,2,tsec)
742 
743  DBG_EXIT("COLL")
744 
745 end subroutine nonlop