TABLE OF CONTENTS


ABINIT/m_nonlop [ Modules ]

[ Top ] [ Modules ]

NAME

  m_nonlop

FUNCTION

COPYRIGHT

  Copyright (C) 1998-2018 ABINIT group (MT)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

20 #if defined HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23 
24 #include "abi_common.h"
25 
26 module m_nonlop
27 
28  use defs_basis
29  use defs_abitypes
30  use m_errors
31  use m_abicore
32  use m_xmpi
33  use m_cgtools
34  use m_gemm_nonlop
35 
36  use m_time,        only : timab
37  use m_hamiltonian, only : gs_hamiltonian_type, KPRIME_H_K, K_H_KPRIME, K_H_K, KPRIME_H_KPRIME
38  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_copy
39  use m_nonlop_pl,   only : nonlop_pl
40  use m_nonlop_ylm,  only : nonlop_ylm
41 
42  implicit none
43 
44  private

ABINIT/nonlop [ Functions ]

[ Top ] [ Functions ]

NAME

 nonlop

FUNCTION

 This routine is a driver to compute:
 * Application of a nonlocal operator Vnl_k_k^prime in order to get:
    - contracted elements (energy, forces, stresses, ...), if signs=1
    - a function in reciprocal space (|out> = Vnl|in>),    if signs=2
 * Optionally, in case of PAW calculation:
   - Application of the overlap matrix in reciprocal space
     (<in|S|in> or (I+S)|in>).
   - Application of (Vnl-lambda.S) in reciprocal space
 According to user s choice, the routine calls a subroutine, computing all quantities:
   - using Legendre Polynomials Pl (Norm-conserving psps only)
   - using Spherical Harmonics Ylm (N-conserving or PAW ; compulsory for PAW)
   - using GPUs (N-conserving or PAW)

INPUTS

  choice: chooses possible output:
    choice=0 => do nothing (only compute WF projected with NL projectors)
          =1 => non-local energy contribution
          =2 => 1st derivative(s) with respect to atomic position(s)
          =3 => 1st derivative(s) with respect to strain(s)
          =23=> 1st derivative(s) with respect to atomic pos. and
                1st derivative(s) with respect to atomic pos. and strains
          =4 => 2nd derivative(s) with respect to 2 atomic pos.
          =24=> 1st derivative(s) with respect to atm. pos. and
                2nd derivative(s) with respect to 2 atomic pos.
          =5 => 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |p_i> D_ij <dp_j/dk| + |dp_i/dk> D_ij < p_j| ]
          =6 => 2nd derivative(s) with respect to 2 strains and
                mixed 2nd derivative(s) with respect to strains & atomic pos.
          =51 =>right 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |p_i> D_ij <dp_j/dk| ]
          =52 =>left 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |dp_i/dk> D_ij < p_j| ]
          =53 =>twist 1st derivative(s) with respect to k, typically
                sum_ij [ |dp_i/dk_(idir+1)> D_ij <dp_j//dk_(idir-1)|
                        -|dp_i/dk_(idir-1)> D_ij <dp_j//dk_(idir+1)|]
          =54=> mixed 2nd derivative(s) with respect to atomic pos. and left k wavevector
          =55=> mixed 2nd derivative(s) with respect to strain and right k wavevector
          =7 => apply operator $\sum_i [ |p_i> <p_i| ],
                same as overlap operator with s_ij=identity (paw_opt==3 only)
          =8 => 2nd derivatives with respect to 2 k wavevectors
          =81=> partial 2nd derivatives with respect to 2 k wavevectors,
                full derivative with respect to k1, right derivative with respect to k2,
                (derivative with respect to k of choice 51), typically
                sum_ij [ |dp_i/dk1> D_ij <dp_j/dk2| + |p_i> D_ij < d2p_j/dk1dk2| ]
    Only choices 1,2,3,23,4,5,6 are compatible with useylm=0.
    Only choices 1,2,3,5,51,52,53,7,8,81 are compatible with signs=2
  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
  [enl]=optional (if not present, use hamk%ekb); non-local coeffs connecting projectors
        see hamk%ekb description
  hamk <type(gs_hamiltonian_type)>=data defining the Hamiltonian at a given k (NL part involved here)
     | atindx1(natom)=index table for atoms, inverse of atindx
     | dimekb1,dimekb2=dimensions of ekb (see ham%ekb)
     | dimekbq=1 if enl factors do not contain a exp(-iqR) phase, 2 is they do
     | ekb(dimekb1,dimekb2,nspinor**2,dimekbq)=
     |   ->NC psps (paw_opt=0) : Kleinman-Bylander energies (hartree)
     |                           dimekb1=lmnmax, dimekb2=ntypat
     |   ->PAW (paw_opt=1 or 4): Dij coefs connecting projectors (ij symmetric)
     |                           dimekb1=cplex_ekb*lmnmax*(lmnmax+1)/2, dimekb2=natom
     |                           Complex numbers if cplex_ekb=2
     |                           ekb(:,:,1)= Dij^up-up, ekb(:,:,2)= Dij^dn-dn
     |                           ekb(:,:,3)= Dij^up-dn, ekb(:,:,4)= Dij^dn-up (only if nspinor=2)
     | ffnl_k(npw_k,dimffnl_k,lmnmax,ntypat)=nonlocal form factors at k
     | ffnl_kp(npw_kp,dimffnl_kp,lmnmax,ntypat)=nonlocal form factors at k^prime
     | gmet(3,3)=metric tensor for G vecs (in bohr**-2)
     | gprimd(3,3)=dimensional reciprocal space primitive translations
     | indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=ln (useylm=0) or i=lmn (useylm=1)
     | istwf_k=option parameter that describes the storage of wfs at k
     | istwf_kp=option parameter that describes the storage of wfs at k^prime
     | lmnmax=max. number of (l,m,n) components over all types of atoms
     | matblk=dimension of the arrays ph3d_k and ph3d_kp
     | mgfft=maximum size of 1D FFTs
     | mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
     | mpssoang= 1+max(spin*angular momentum) for nonlocal pseudopotentials
     | natom=number of atoms in cell
     | nattyp(ntypat)=number of atoms of each type
     | ngfft(18)=contain all needed information about 3D FFT
     ! kg_k(3,npw_k)=integer coords of planewaves in basis sphere, for k
     ! kg_kp(3,npw_kp)=integer coords of planewaves in basis sphere, for k^prime
     ! kpg_k(npw_k,:)= (k+G) components and related data
     ! kpg_kp(npw_kp,:)=(k^prime+G) components and related data,
     ! kpt(3)=k point in terms of recip. translations
     ! kptp(3)=k^prime point in terms of recip. translations
     | nloalg(3)=governs the choice of the algorithm for nonlocal operator
     ! npw_k=number of (k+G) planewaves
     ! npw_kp=number of (k^prime+G) planewaves
     | ntypat=number of types of atoms in cell
     | nspinor=total number of spinorial components of the wavefunctions
     | ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
     | ph3d_k(2,npw_k,matblk)=3D structure factors, for each atom and (k+g) plane wave
     | ph3d_kp(2,npw_kp,matblk)=3-dim structure factors, for each atom and (k^prime+g) plane wave
     | phkxred(2,natom)=phase factors exp(2 pi k.xred)
     | phkpxred(2,natom)=phase factors exp(2 pi k^prime.xred)
     | sij(dimekb1,ntypat)=overlap matrix components (only if paw_opt=2, 3 or 4)
     | ucvol=unit cell volume (bohr^3)
     | use_gpu_cuda=governs wheter we do the hamiltonian calculation on gpu or not
     | useylm=how the NL operator is to be applied: 1=using Ylm, 0=using Legendre polynomials
  [iatom_only]=optional. If present (and >0), only projectors related to atom of index iatom_only
          will be applied. (used fi to apply derivative of NL operator wrt an atomic displacement)
  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
                        - k point direction in the case (choice=5,51,or 52)
                          for choice 53 signs=2, cross derivatives are in idir-1 and idir+1 directions
                        - strain component (1:6) in the case (choice=3,signs=2) or (choice=6,signs=1)
  lambda=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
         Typically lambda is the eigenvalue (or its guess)
  mpi_enreg=informations about MPI parallelization
  ndat=number of wavefunctions on which to apply nonlop
  nnlout=dimension of enlout (when signs=1 and choice>0):
         ==== if paw_opt=0, 1 or 2 ====
         choice   nnlout     |  choice   nnlout
              1   1          |      51   6 (complex)
              2   3*natom    |      52   6 (complex)
              3   6          |      53   6
              4   6*natom    |      54   9*natom
             23   6+3*natom  |      55   36 (complex)
             24   9*natom    |       6   36+18*natom
              5   3          |       8   6
                             |      81   18 (complex)
         ==== if paw_opt=3 ====
         choice   nnlout
              1   1
              2   3*natom
              5   3
             51   3
             52   3
             54   9*natom
             55   36
              7   1
              8   6
             81   9
         ==== if paw_opt=4 ====
         not available
  [only_SO]=optional, flag to calculate only the SO part in nonlop
  paw_opt= define the nonlocal operator concerned with:
           paw_opt=0 : Norm-conserving Vnl (use of Kleinman-Bylander ener.)
           paw_opt=1 : PAW nonlocal part of H (use of Dij coeffs)
           paw_opt=2 : PAW: (Vnl-lambda.Sij) (Sij=overlap matrix)
           paw_opt=3 : PAW overlap matrix (Sij)
           paw_opt=4 : both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
  [select_k]=optional, option governing the choice of k points to be used.
             hamk datastructure contains quantities needed to apply NL operator
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|Vnl|k>       is applied [default]
             if select_k=2, <k|Vnl|k^prime>       is applied
             if select_k=3, <k|Vnl|k>             is applied
             if select_k=4, <k^prime|Vnl|k^prime> is applied
  signs= if 1, get contracted elements (energy, forces, stress, ...)
         if 2, applies the non-local operator to a function in reciprocal space
  tim_nonlop=timing code of the calling routine (can be set to 0 if not attributed)
  vectin(2,npwin*my_nspinor*ndat)=input cmplx wavefunction coefficients <G|Cnk>

OUTPUT

 ==== if (signs==1) ====
 --If (paw_opt==0, 1 or 2)
    enlout(nnlout)= contribution to the non-local part of the following properties:
      if choice=1 : enlout(1)             -> the energy
      if choice=2 : enlout(3*natom)       -> 1st deriv. of energy wrt atm. pos (forces)
      if choice=3 : enlout(6)             -> 1st deriv. of energy wrt strain (stresses)
      if choice=4 : enlout(6*natom)       -> 2nd deriv. of energy wrt 2 atm. pos (dyn. mat.)
      if choice=23: enlout(6+3*natom)     -> 1st deriv. of energy wrt atm. pos (forces) and
                                             1st deriv. of energy wrt strain (stresses)
      if choice=24: enlout(9*natom)       -> 1st deriv. of energy wrt atm. pos (forces) and
                                             2nd deriv. of energy wrt 2 atm. pos (dyn. mat.)
      if choice=5 : enlout(3)             -> 1st deriv. of energy wrt k
      if choice=51: enlout(3)             -> 1st deriv. (right) of energy wrt k
      if choice=52: enlout(3)             -> 1st deriv. (left) of energy wrt k
      if choice=53: enlout(3)             -> 1st deriv. (twist) of energy wrt k
      if choice=54: enlout(18*natom)      -> 2nd deriv. of energy wrt atm. pos and right k (Born eff. charge)
      if choice=55: enlout(36)            -> 2nd deriv. of energy wrt strain and right k (piezoelastic tensor)
      if choice=6 : enlout(36+18*natom)   -> 2nd deriv. of energy wrt 2 strains (elast. tensor) and
                                             2nd deriv. of energy wrt to atm. pos and strain (internal strain)
      if choice=8 : enlout(6)             -> 2nd deriv. of energy wrt 2 k
      if choice=81: enlout(9)             -> 2nd deriv.of E: full derivative w.r.t. k1, right derivative w.r.t k2
 --If (paw_opt==3)
      if choice=1 : enlout(1)             -> contribution to <c|S|c> (note: not including <c|c>)
      if choice=2 : enlout(3*natom)       -> contribution to <c|dS/d_atm.pos|c>
      if choice=51: enlout(3)             -> contribution to <c|d(right)S/d_k|c>
      if choice=52: enlout(3)             -> contribution to <c|d(left)S/d_k|c>
      if choice=54: enlout(18*natom)      -> 2nd deriv. of energy wrt atm. pos and right k (Born eff. charge)
      if choice=55: enlout(36)            -> 2nd deriv. of energy wrt strain and right k (piezoelastic tensor)
      if choice=7 : enlout(1)             -> contribution to <c|sum_i[p_i><p_i]|c>
      if choice=8 : enlout(6)             -> contribution to <c|d2S/d_k1d_k2|c>
      if choice=81: enlout(9)             -> contribution to <c|dS/d_k1[d(right)d_k2]|c>
 --If (paw_opt==4)
      not available
 ==== if (signs==2) ====
 --if (paw_opt=0, 1 or 4)
    vectout(2,npwout*my_nspinor*ndat)=result of the aplication of the concerned operator
                or one of its derivatives to the input vect.:
      if (choice=1)  <G|V_nonlocal|vect_in>
      if (choice=2)  <G|dV_nonlocal/d(atm. pos)|vect_in>
      if (choice=3)  <G|dV_nonlocal/d(strain)|vect_in>
      if (choice=5)  <G|dV_nonlocal/d(k)|vect_in>
      if (choice=51) <G|d(right)V_nonlocal/d(k)|vect_in>
      if (choice=52) <G|d(left)V_nonlocal/d(k)|vect_in>
      if (choice=53) <G|d(twist)V_nonlocal/d(k)|vect_in>
      if (choice=8)  <G|d2V_nonlocal/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right)V_nonlocal/d(k)]/d(k)|vect_in>
 --if (paw_opt=2)
    vectout(2,npwout*my_nspinor*ndat)=final vector in reciprocal space:
      if (choice=1)  <G|V_nonlocal-lamdba.(I+S)|vect_in>
      if (choice=2)  <G|d[V_nonlocal-lamdba.(I+S)]/d(atm. pos)|vect_in>
      if (choice=3)  <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_in>
      if (choice=5)  <G|d[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=51) <G|d(right)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=52) <G|d(left)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=53) <G|d(twist)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=8)  <G|d2[V_nonlocal-lamdba.(I+S)]/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right[V_nonlocal-lamdba.(I+S)]/d(k)]/d(k)|vect_in>
 --if (paw_opt=3 or 4)
    svectout(2,npwout*my_nspinor*ndat)=result of the aplication of Sij (overlap matrix)
                  or one of its derivatives to the input vect.:
      if (choice=1)  <G|I+S|vect_in>
      if (choice=2)  <G|dS/d(atm. pos)|vect_in>
      if (choice=3)  <G|dS/d(strain)|vect_in>
      if (choice=5)  <G|dS/d(k)|vect_in>
      if (choice=51) <G|d(right)S/d(k)|vect_in>
      if (choice=52) <G|d(left)S/d(k)|vect_in>
      if (choice=53) <G|d(twist)S/d(k)|vect_in>
      if (choice=3)  <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_in>
      if (choice=7)  <G|sum_i[p_i><p_i]|vect_in>
      if (choice=8)  <G|d2S/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right)S/d(k)]/d(k)|vect_in>

SIDE EFFECTS

  ==== ONLY IF useylm=1
  cprjin(natom,my_nspinor*ndat) <type(pawcprj_type)>=projected input wave function |in> on non-local projectors
                                  =<p_lmn|in> and derivatives
                    Treatment depends on cpopt parameter:
                     if cpopt=-1, <p_lmn|in> (and derivatives)
                                  are computed here (and not saved)
                     if cpopt= 0, <p_lmn|in> are computed here and saved
                                  derivatives are eventually computed but not saved
                     if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 2  <p_lmn|in> are already in memory;
                                  first (and 2nd) derivatives are computed here and not saved
                     if cpopt= 3  <p_lmn|in> are already in memory;
                                  first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 4  <p_lmn|in> and first derivatives are already in memory;
                                  other derivatives are not computed
                                  This option is not compatible with choice=4,24 or 6
                     If useylm=0, must have cpopt=-1!
                     Warning: for cpopt= 1 or 3, derivatives wrt strains do not contain
                              the contribution due to the volume change;
                              i.e. <dp_lmn/dEps|in> are incomplete.

NOTES

 * See nonlop_pl and nonlop_ylm to have more comments...
 * In the case signs=1, the array vectout is not used.

PARENTS

      d2frnl,dfpt_nsteltwf,dfptnl_resp,energy,fock_getghc,forstrnps,getgh1c
      getgh2c,getghc,getgsc,m_invovl,m_lobpcgwf,make_grad_berry,nonlop_test
      prep_nonlop,vtowfk,wfd_vnlpsi

CHILDREN

      gemm_nonlop,nonlop_gpu,nonlop_pl,nonlop_ylm,pawcprj_alloc,pawcprj_copy
      pawcprj_free,timab

SOURCE

320 subroutine nonlop(choice,cpopt,cprjin,enlout,hamk,idir,lambda,mpi_enreg,ndat,nnlout,&
321 &                 paw_opt,signs,svectout,tim_nonlop,vectin,vectout,&
322 &                 enl,iatom_only,only_SO,select_k) !optional arguments
323 
324 
325 !This section has been created automatically by the script Abilint (TD).
326 !Do not modify the following lines by hand.
327 #undef ABI_FUNC
328 #define ABI_FUNC 'nonlop'
329 !End of the abilint section
330 
331  implicit none
332 
333 !Arguments ------------------------------------
334 !scalars
335  integer,intent(in) :: choice,cpopt,idir,ndat,nnlout,paw_opt,signs,tim_nonlop
336  integer,intent(in),optional :: iatom_only,only_SO,select_k
337  type(MPI_type),intent(in) :: mpi_enreg
338  type(gs_hamiltonian_type),intent(in),target :: hamk
339 !arrays
340  real(dp),intent(in) :: lambda(ndat)
341  real(dp),intent(in),target,optional :: enl(:,:,:,:)
342  real(dp),intent(inout),target :: vectin(:,:)
343  real(dp),intent(out),target :: enlout(:),svectout(:,:)
344  real(dp),intent(inout),target :: vectout(:,:)
345  type(pawcprj_type),intent(inout),target :: cprjin(:,:)
346 
347 !Local variables-------------------------------
348 !scalars
349  integer :: dimenl1,dimenl2,dimenl2_,dimekbq,dimffnlin,dimffnlout,dimsij,iatm,iatom_only_,idat
350  integer :: ii,ispden,ispinor,istwf_k,itypat,jspinor,matblk_,my_nspinor,n1,n2,n3,natom_,ncpgr_atm
351  integer :: nkpgin,nkpgout,npwin,npwout,ntypat_,only_SO_,select_k_,shift1,shift2,shift3
352  logical :: atom_pert,force_recompute_ph3d,kpgin_allocated,kpgout_allocated,use_gemm_nonlop
353  character(len=500) :: msg
354 !arrays
355  integer :: nlmn_atm(1),nloalg_(3)
356  integer,pointer :: kgin(:,:),kgout(:,:)
357  integer, ABI_CONTIGUOUS pointer :: atindx1_(:),indlmn_(:,:,:),nattyp_(:)
358  real(dp) :: tsec(2)
359  real(dp),pointer :: enl_ptr(:,:,:,:)
360  real(dp),pointer :: ffnlin(:,:,:,:),ffnlin_(:,:,:,:),ffnlout(:,:,:,:),ffnlout_(:,:,:,:)
361  real(dp),pointer :: kpgin(:,:),kpgout(:,:)
362  real(dp) :: kptin(3),kptout(3)
363  real(dp),pointer :: ph3din(:,:,:),ph3din_(:,:,:),ph3dout(:,:,:),ph3dout_(:,:,:)
364  real(dp),pointer :: phkxredin(:,:),phkxredin_(:,:),phkxredout(:,:),phkxredout_(:,:)
365  real(dp), ABI_CONTIGUOUS pointer :: ph1d_(:,:),sij_(:,:)
366  real(dp), pointer :: enl_(:,:,:,:)
367  type(pawcprj_type),pointer :: cprjin_(:,:)
368   integer :: b0,b1,b2,b3,b4,e0,e1,e2,e3,e4
369 
370 ! **********************************************************************
371 
372  DBG_ENTER("COLL")
373 
374 !Keep track of time spent in this routine (selection of different slots for different choices)
375  call timab(220+tim_nonlop,1,tsec)
376 
377  only_SO_=0; if (present(only_SO)) only_SO_=only_SO
378  my_nspinor=max(1,hamk%nspinor/mpi_enreg%nproc_spinor)
379 
380  force_recompute_ph3d=.false.
381 
382 !Error(s) on incorrect input
383  if (hamk%useylm==0) then
384    if (paw_opt>0) then
385      MSG_BUG('When paw_opt>0 you must use ylm version of nonlop! Set useylm 1.')
386    end if
387    if (cpopt/=-1) then
388      MSG_BUG('If useylm=0, ie no PAW, then cpopt/=-1 is not allowed !')
389    end if
390    if (hamk%dimekbq/=1) then
391      MSG_BUG('If useylm=0, ie no PAW, then dimekbq/=-1 is not allowed !')
392    end if
393    if (hamk%use_gpu_cuda/=0) then
394      msg = 'When use_gpu_cuda/=0 you must use ylm version of nonlop! Set useylm 1.'
395      MSG_BUG(msg)
396    end if
397  end if
398  if (hamk%use_gpu_cuda/=0.and.hamk%dimekbq/=1) then
399    msg = 'GPU version of nonlop not compatible with a exp(-iqR) phase!'
400    MSG_BUG(msg)
401  end if
402  if ((.not.associated(hamk%kg_k)).or.(.not.associated(hamk%kg_kp))) then
403    MSG_BUG('kg_k/kg_kp should be associated!')
404  end if
405  if ((.not.associated(hamk%ffnl_k)).or.(.not.associated(hamk%ffnl_kp))) then
406    MSG_BUG('ffnl_k/ffnl_kp should be associated!')
407  end if
408 !if (hamk%istwf_k/=hamk%istwf_kp) then
409 !  msg = 'istwf has to be the same for both k-points.'
410 !  MSG_BUG(msg)
411 !end if
412 
413 !Select k-dependent objects according to select_k input parameter
414  select_k_=1;if (present(select_k)) select_k_=select_k
415  nkpgin=0;nkpgout=0;nullify(kpgin);nullify(kpgout)
416  nullify(ph3din);nullify(ph3dout)
417  if (select_k_==KPRIME_H_K) then
418 !  ===== <k^prime|Vnl|k> =====
419    kptin = hamk%kpt_k ; kptout = hamk%kpt_kp
420    npwin=hamk%npw_fft_k ; npwout=hamk%npw_fft_kp
421    kgin => hamk%kg_k ; kgout => hamk%kg_kp
422    if (associated(hamk%kpg_k)) then
423      kpgin => hamk%kpg_k ; nkpgin=size(kpgin,2)
424    end if
425    if (associated(hamk%kpg_kp)) then
426      kpgout => hamk%kpg_kp ; nkpgout=size(kpgout,2)
427    end if
428    phkxredin => hamk%phkxred ; phkxredout => hamk%phkpxred
429    ffnlin => hamk%ffnl_k ; ffnlout => hamk%ffnl_kp
430    if (associated(hamk%ph3d_k )) ph3din  => hamk%ph3d_k
431    if (associated(hamk%ph3d_kp)) ph3dout => hamk%ph3d_kp
432    force_recompute_ph3d=(.not.(associated(hamk%ph3d_k).and.associated(hamk%ph3d_kp)))
433    istwf_k=hamk%istwf_k
434  else if (select_k_==K_H_KPRIME) then
435 !  ===== <k|Vnl|k^prime> =====
436    kptin = hamk%kpt_kp ; kptout = hamk%kpt_k
437    npwin=hamk%npw_fft_kp ; npwout=hamk%npw_fft_k
438    kgin => hamk%kg_kp ; kgout => hamk%kg_k
439    if (associated(hamk%kpg_kp)) then
440      kpgin => hamk%kpg_kp ; nkpgin=size(kpgin,2)
441    end if
442    if (associated(hamk%kpg_k)) then
443      kpgout => hamk%kpg_k ; nkpgout=size(kpgout,2)
444    end if
445    phkxredin => hamk%phkpxred ; phkxredout => hamk%phkxred
446    ffnlin => hamk%ffnl_kp ; ffnlout => hamk%ffnl_k
447    if (associated(hamk%ph3d_kp)) ph3din  => hamk%ph3d_kp
448    if (associated(hamk%ph3d_k )) ph3dout => hamk%ph3d_k
449    force_recompute_ph3d=(.not.(associated(hamk%ph3d_kp).and.associated(hamk%ph3d_k)))
450    istwf_k=hamk%istwf_kp
451  else if (select_k_==K_H_K) then
452 !  ===== <k|Vnl|k> =====
453    kptin = hamk%kpt_k ; kptout = hamk%kpt_k
454    npwin=hamk%npw_fft_k ; npwout=hamk%npw_fft_k
455    kgin => hamk%kg_k ; kgout => hamk%kg_k
456    if (associated(hamk%kpg_k)) then
457      kpgin => hamk%kpg_k ; nkpgin=size(kpgin,2)
458    end if
459    if (associated(hamk%kpg_k)) then
460      kpgout => hamk%kpg_k ; nkpgout=size(kpgout,2)
461    end if
462    phkxredin => hamk%phkxred ; phkxredout => hamk%phkxred
463    ffnlin => hamk%ffnl_k ; ffnlout => hamk%ffnl_k
464    if (associated(hamk%ph3d_k)) ph3din  => hamk%ph3d_k
465    if (associated(hamk%ph3d_k)) ph3dout => hamk%ph3d_k
466    force_recompute_ph3d=(.not.(associated(hamk%ph3d_k)))
467    istwf_k=hamk%istwf_k
468  else if (select_k_==KPRIME_H_KPRIME) then
469 !  ===== <k^prime|Vnl|k^prime> =====
470    kptin = hamk%kpt_kp ; kptout = hamk%kpt_kp
471    npwin=hamk%npw_fft_kp ; npwout=hamk%npw_fft_kp
472    kgin => hamk%kg_kp ; kgout => hamk%kg_kp
473    if (associated(hamk%kpg_kp)) then
474      kpgin => hamk%kpg_kp ; nkpgin=size(kpgin,2)
475    end if
476    if (associated(hamk%kpg_kp)) then
477      kpgout => hamk%kpg_kp ; nkpgout=size(kpgout,2)
478    end if
479    phkxredin => hamk%phkpxred ; phkxredout => hamk%phkpxred
480    ffnlin => hamk%ffnl_kp ; ffnlout => hamk%ffnl_kp
481    if (associated(hamk%ph3d_kp)) ph3din  => hamk%ph3d_kp
482    if (associated(hamk%ph3d_kp)) ph3dout => hamk%ph3d_kp
483    force_recompute_ph3d=(.not.(associated(hamk%ph3d_kp)))
484    istwf_k=hamk%istwf_kp
485  end if
486 
487  if (npwin==0.or.npwout==0) return
488  dimffnlin=size(ffnlin,2);dimffnlout=size(ffnlout,2)
489  kpgin_allocated=(.not.associated(kpgin))
490  if (kpgin_allocated) then
491    ABI_ALLOCATE(kpgin,(npwin,0))
492  end if
493  kpgout_allocated=(.not.associated(kpgout))
494  if (kpgout_allocated) then
495    ABI_ALLOCATE(kpgout,(npwout,0))
496  end if
497 
498 !Check some sizes for safety
499 !if (paw_opt==0.or.cpopt<2.or.((cpopt==2.or.cpopt==3).and.choice>1)) then
500  if (size(ffnlin,1)/=npwin.or.size(ffnlin,3)/=hamk%lmnmax) then
501    msg = 'Incorrect size for ffnlin!'
502 !   MSG_BUG(msg)
503  end if
504  if(signs==2) then
505    if (size(ffnlout,1)/=npwout.or.size(ffnlout,3)/=hamk%lmnmax) then
506      msg = 'Incorrect size for ffnlout!'
507      MSG_BUG(msg)
508    end if
509  end if
510 !This test is OK only because explicit sizes are passed to nonlop_* routines
511  if (size(vectin)<2*npwin*my_nspinor*ndat) then
512    msg = 'Incorrect size for vectin!'
513    MSG_BUG(msg)
514  end if
515  if(choice/=0.and.signs==2) then
516    if(paw_opt/=3) then
517 !    This test is OK only because explicit sizes are passed to nonlop_* routines
518      if (size(vectout)<2*npwout*my_nspinor*ndat) then
519        msg = 'Incorrect size for vectout!'
520        MSG_BUG(msg)
521      end if
522    end if
523    if(paw_opt>=3) then
524      if (size(svectout)<2*npwout*my_nspinor*ndat) then
525        msg = 'Incorrect size for svectout!'
526        MSG_BUG(msg)
527      end if
528    end if
529  end if
530  if(cpopt>=0) then
531    if (size(cprjin)/=hamk%natom*my_nspinor*ndat) then
532      msg = 'Incorrect size for cprjin!'
533      MSG_BUG(msg)
534    end if
535  end if
536 
537 !Non-local coefficients connecting projectors:
538 !If enl is present in the arg list, use it; instead use hamk%ebk
539  if (present(enl)) then
540    enl_ptr => enl
541    dimenl1=size(enl,1);dimenl2=size(enl,2);dimekbq=size(enl,4)
542  else
543    enl_ptr => hamk%ekb
544    dimenl1=hamk%dimekb1;dimenl2=hamk%dimekb2;dimekbq=1
545  end if
546 
547 !In the case of a derivative with respect to an atomic displacement,
548 !and if <g|dVnl/dR|c> is required (signs=2), we only need to compute the
549 !derivatives of the projectors associated with the displaced atom.
550  iatom_only_=-1;if (present(iatom_only)) iatom_only_=iatom_only
551  atom_pert=((signs==2).and.(choice==2.or.choice==4.or.choice==24.or.choice==54))
552 
553  if (iatom_only_>0.and.atom_pert) then
554 !   We consider only atom with index iatom_only
555    iatm=hamk%atindx(iatom_only_);itypat=hamk%typat(iatom_only_)
556    natom_=1 ; ntypat_=1 ; dimenl2_=1 ; matblk_=1
557    nloalg_(:)=hamk%nloalg(:)
558    ABI_ALLOCATE(atindx1_,(1))
559    ABI_ALLOCATE(nattyp_,(1))
560    atindx1_(1)=1 ; nattyp_(1)=1
561 !  Store at the right place the 1d phases
562    n1=hamk%ngfft(1);n2=hamk%ngfft(2);n3=hamk%ngfft(3)
563    ABI_ALLOCATE(ph1d_,(2,(2*n1+1)+(2*n2+1)+(2*n3+1)))
564    shift1=(iatm-1)*(2*n1+1)
565    ph1d_(:,1:2*n1+1)=hamk%ph1d(:,1+shift1:2*n1+1+shift1)
566    shift2=(iatm-1)*(2*n2+1)+hamk%natom*(2*n1+1)
567    ph1d_(:,1+2*n1+1:2*n2+1+2*n1+1)=hamk%ph1d(:,1+shift2:2*n2+1+shift2)
568    shift3=(iatm-1)*(2*n3+1)+hamk%natom*(2*n1+1+2*n2+1)
569    ph1d_(:,1+2*n1+1+2*n2+1:2*n3+1+2*n2+1+2*n1+1)=hamk%ph1d(:,1+shift3:2*n3+1+shift3)
570    ABI_ALLOCATE(phkxredin_,(2,1))
571    ABI_ALLOCATE(phkxredout_,(2,1))
572    phkxredin_(:,1)=phkxredin(:,iatm)
573    phkxredout_(:,1)=phkxredout(:,iatm)
574    ABI_ALLOCATE(ph3din_,(2,npwin,1))
575    ABI_ALLOCATE(ph3dout_,(2,npwout,1))
576    if (force_recompute_ph3d.or.hamk%matblk<hamk%natom) then
577      nloalg_(2)=-abs(nloalg_(2)) !Will compute the 3D phase factors inside nonlop
578    else
579      ph3din_(:,1:npwin,1)=ph3din(:,1:npwin,iatm)
580      ph3dout_(:,1:npwout,1)=ph3dout(:,1:npwout,iatm)
581    end if
582    ABI_ALLOCATE(ffnlin_,(npwin,dimffnlin,hamk%lmnmax,1))
583    ABI_ALLOCATE(ffnlout_,(npwout,dimffnlout,hamk%lmnmax,1))
584    ffnlin_(:,:,:,1)=ffnlin(:,:,:,itypat)
585    ffnlout_(:,:,:,1)=ffnlout(:,:,:,itypat)
586    ABI_DATATYPE_ALLOCATE(cprjin_,(1,my_nspinor*((cpopt+5)/5)))
587    if (cpopt>=0) then
588      nlmn_atm(1)=cprjin(iatm,1)%nlmn
589      ncpgr_atm=cprjin(iatm,1)%ncpgr
590      call pawcprj_alloc(cprjin_,ncpgr_atm,nlmn_atm)
591      do idat=1,ndat
592        do ispinor=1,my_nspinor
593          jspinor=ispinor+(idat-1)*my_nspinor
594          call pawcprj_copy(cprjin(iatm:iatm,jspinor:jspinor),cprjin_(1:1,jspinor:jspinor))
595        end do
596      end do
597    end if
598    if (size(enl_ptr)>0) then
599      ABI_ALLOCATE(enl_,(size(enl_ptr,1),1,hamk%nspinor**2,size(enl_ptr,4)))
600      do ii=1,size(enl_ptr,4)
601        do ispden=1,hamk%nspinor**2
602          if (dimenl2==hamk%natom .and. hamk%usepaw==1) then
603            enl_(:,1,ispden,ii)=enl_ptr(:,iatom_only_,ispden,ii)
604          else if (dimenl2==hamk%ntypat) then
605            enl_(:,1,ispden,ii)=enl_ptr(:,itypat,ispden,ii)
606          else
607            enl_(:,1,ispden,ii)=enl_ptr(:,1,ispden,ii)
608          end if
609        end do
610      end do
611    else
612      ABI_ALLOCATE(enl_,(0,0,0,0))
613    end if
614    if (allocated(hamk%sij)) then
615      dimsij=size(hamk%sij,1)
616      ABI_ALLOCATE(sij_,(dimsij,1))
617      if (size(hamk%sij,2)==hamk%ntypat) then
618        sij_(:,1)=hamk%sij(:,itypat)
619      else if (size(hamk%sij)>0) then
620        sij_(:,1)=hamk%sij(:,1)
621      end if
622    end if
623    ABI_ALLOCATE(indlmn_,(6,hamk%lmnmax,1))
624    indlmn_(:,:,1)=hamk%indlmn(:,:,itypat)
625 
626  else
627 !  Usual case: all atoms are processed
628    natom_  =hamk%natom; ntypat_=hamk%ntypat
629    dimenl2_=dimenl2   ; matblk_=hamk%matblk
630    nloalg_(:)  = hamk%nloalg(:)
631    atindx1_    => hamk%atindx1
632    nattyp_     => hamk%nattyp
633    ph1d_       => hamk%ph1d
634    phkxredin_  => phkxredin
635    phkxredout_ => phkxredout
636    ffnlin_     => ffnlin
637    ffnlout_    => ffnlout
638    cprjin_     => cprjin
639    enl_        => enl_ptr
640    sij_        => hamk%sij
641    indlmn_     => hamk%indlmn
642    if (force_recompute_ph3d) then
643      nloalg_(2)=-abs(nloalg_(2)) !Will compute the 3D phase factors inside nonlop
644      ABI_ALLOCATE(ph3din_,(2,npwin,hamk%matblk))
645      ABI_ALLOCATE(ph3dout_,(2,npwout,hamk%matblk))
646    else
647      ph3din_     => ph3din
648      ph3dout_    => ph3dout
649    end if
650  end if
651 
652 !A specific version of nonlop based on BLAS3 can be used
653 !But there are several restrictions
654  use_gemm_nonlop= ( gemm_nonlop_use_gemm .and. &
655 & signs == 2 .and. paw_opt /= 2 .and. hamk%nspinor == 1 .and. &
656 & cpopt < 3 .and. hamk%useylm /= 0 .and. &
657 & (choice < 2 .or. choice == 7) )
658  if(use_gemm_nonlop) then
659    call gemm_nonlop(atindx1_,choice,cpopt,cprjin_,dimenl1,dimenl2_,dimekbq,&
660 &   dimffnlin,dimffnlout,enl_,enlout,ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,&
661 &   idir,indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,&
662 &   hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,hamk%mpsang,hamk%mpssoang,&
663 &   natom_,nattyp_,ndat,hamk%ngfft,nkpgin,nkpgout,nloalg_,&
664 &   nnlout,npwin,npwout,my_nspinor,hamk%nspinor,ntypat_,only_SO_,paw_opt,&
665 &   phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,svectout,&
666 &   tim_nonlop,hamk%ucvol,hamk%useylm,vectin,vectout,hamk%use_gpu_cuda)
667 
668  else
669    !$omp parallel do default(shared), &
670    !$omp& firstprivate(ndat,npwin,my_nspinor,choice,signs,paw_opt,npwout,cpopt,nnlout), &
671    !$omp& private(b0,b1,b2,b3,b4,e0,e1,e2,e3,e4)
672    !!$omp& schedule(static), if(hamk%use_gpu_cuda==0)
673    do idat=1, ndat
674      !vectin_idat => vectin(:,1+npwin*my_nspinor*(idat-1):npwin*my_nspinor*idat)
675      b0 = 1+npwin*my_nspinor*(idat-1)
676      e0 = npwin*my_nspinor*idat
677      if (choice/=0.and.signs==2.and.paw_opt/=3) then
678        !vectout_idat => vectout(:,1+npwout*my_nspinor*(idat-1):npwout*my_nspinor*idat)
679        b1 = 1+npwout*my_nspinor*(idat-1)
680        e1 = npwout*my_nspinor*idat
681      else
682        !vectout_idat => vectout
683        b1 = lbound(vectout,dim=2)
684        e1 = ubound(vectout,dim=2)
685      end if
686      if (choice/=0.and.signs==2.and.paw_opt>=3) then
687        !svectout_idat => svectout(:,1+npwout*my_nspinor*(idat-1):npwout*my_nspinor*idat)
688        b2 = 1+npwout*my_nspinor*(idat-1)
689        e2 = npwout*my_nspinor*idat
690      else
691        !svectout_idat => svectout
692        b2 = lbound(svectout,dim=2)
693        e2 = ubound(svectout,dim=2)
694      end if
695 
696      if (cpopt>=0) then
697        !cprjin_idat => cprjin_(:,my_nspinor*(idat-1)+1:my_nspinor*(idat))
698        b3 = my_nspinor*(idat-1)+1
699        e3 = my_nspinor*(idat)
700      else
701        !cprjin_idat => cprjin_
702        b3 = lbound(cprjin_,dim=2)
703        e3 = ubound(cprjin_,dim=2)
704      end if
705      if (nnlout>0) then
706       !enlout_idat => enlout((idat-1)*nnlout+1:(idat*nnlout))
707        b4 = (idat-1)*nnlout+1
708        e4 = (idat*nnlout)
709      else
710       !enlout_idat => enlout
711        b4 = lbound(enlout,dim=1)
712        e4 = ubound(enlout,dim=1)
713      end if
714 
715 !    Legendre Polynomials version
716      if (hamk%useylm==0) then
717        call nonlop_pl(choice,dimenl1,dimenl2_,dimffnlin,dimffnlout,enl_,&
718 &       enlout(b4:e4),ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,idir,indlmn_,istwf_k,&
719 &       kgin,kgout,kpgin,kpgout,kptin,kptout,hamk%lmnmax,matblk_,hamk%mgfft,&
720 &       mpi_enreg,hamk%mpsang,hamk%mpssoang,natom_,nattyp_,hamk%ngfft,&
721 &       nkpgin,nkpgout,nloalg_,npwin,npwout,my_nspinor,hamk%nspinor,&
722 &       ntypat_,only_SO_,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,hamk%ucvol,&
723 &       vectin(:,b0:e0),vectout(:,b1:e1))
724 !    Spherical Harmonics version
725      else if (hamk%use_gpu_cuda==0) then
726        call nonlop_ylm(atindx1_,choice,cpopt,cprjin_(:,b3:e3),dimenl1,dimenl2_,dimekbq,&
727 &       dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
728 &       indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
729 &       hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
730 &       nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
731 &       ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
732 &       svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1))
733 !    GPU version
734      else
735        call nonlop_gpu(atindx1_,choice,cpopt,cprjin(:,b3:e3),dimenl1,dimenl2_,&
736 &       dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
737 &       indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
738 &       hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
739 &       nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
740 &       ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
741 &       svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1))
742      end if
743    end do
744    !$omp end parallel do
745  end if
746 
747 !Release temporary storage
748  if (iatom_only_>0.and.atom_pert) then
749    if (cpopt>=0) then
750      call pawcprj_free(cprjin_)
751    end if
752    ABI_DEALLOCATE(atindx1_)
753    ABI_DEALLOCATE(nattyp_)
754    ABI_DEALLOCATE(ph1d_)
755    ABI_DEALLOCATE(ph3din_)
756    ABI_DEALLOCATE(ph3dout_)
757    ABI_DEALLOCATE(phkxredin_)
758    ABI_DEALLOCATE(phkxredout_)
759    ABI_DEALLOCATE(ffnlin_)
760    ABI_DEALLOCATE(ffnlout_)
761    ABI_DEALLOCATE(enl_)
762    ABI_DEALLOCATE(indlmn_)
763    ABI_DATATYPE_DEALLOCATE(cprjin_)
764    if (allocated(hamk%sij)) then
765      ABI_DEALLOCATE(sij_)
766    end if
767  else if (force_recompute_ph3d) then
768    ABI_DEALLOCATE(ph3din_)
769    ABI_DEALLOCATE(ph3dout_)
770  end if
771  if (kpgin_allocated) then
772    ABI_DEALLOCATE(kpgin)
773  end if
774  if (kpgout_allocated) then
775    ABI_DEALLOCATE(kpgout)
776  end if
777 
778  call timab(220+tim_nonlop,2,tsec)
779 
780  DBG_EXIT("COLL")
781 
782 end subroutine nonlop

ABINIT/nonlop_gpu [ Functions ]

[ Top ] [ Functions ]

NAME

 nonlop_gpu

FUNCTION

  Compute application of a nonlocal operator, using GPU (NVidia Cuda)
  This routine is an interface to Cuda Kernel gpu_nonlop.cu

COPYRIGHT

 Copyright (C) 2011-2018 ABINIT group (FDahm, MT)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt.

INPUTS

  atindx1(natom)=index table for atoms, inverse of atindx
  choice: chooses possible output:
    choice=0 => do nothing (only compute WF projected with NL projectors)
          =1 => a non-local energy contribution
          =2 => a gradient with respect to atomic position(s)
          =3 => a gradient with respect to strain(s)
          =23=> a gradient with respect to atm. pos. and strain(s)
  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
  dimenl1,dimenl2=dimensions of enl (see enl)
  dimffnlin=second dimension of ffnlin (1+number of derivatives)
  dimffnlout=second dimension of ffnlout (1+number of derivatives)
  enl(dimenl1,dimenl2,nspinortot**2)=
  ->Norm conserving : ==== when paw_opt=0 ====
                      (Real) Kleinman-Bylander energies (hartree)
                      dimenl1=lmnmax  -  dimenl2=ntypat
  ->PAW :             ==== when paw_opt=1, 2 or 4 ====
                      (Real or complex, hermitian) Dij coefs to connect projectors
                      dimenl1=cplex_enl*lmnmax*(lmnmax+1)/2  -  dimenl2=natom
  ffnlin(npwin,dimffnlin,lmnmax,ntypat)=nonlocal form factors to be used
          for the application of the nonlocal operator to the |in> vector
  ffnlout(npwout,dimffnlout,lmnmax,ntypat)=nonlocal form factors to be used
          for the application of the nonlocal operator to the |out> vector
  gprimd(3,3)=dimensional reciprocal space primitive translations
  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
                        - k point direction in the case (choice=5,signs=2)
                          for choice 53, twisted derivative involves idir+1 and idir-1
                        - strain component (1:6) in the case (choice=3,signs=2) or (choice=6,signs=1)
  indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=lmn
  istwf_k=option parameter that describes the storage of wfs
  kgin(3,npwin)=integer coords of planewaves in basis sphere, for the |in> vector
  kgout(3,npwout)=integer coords of planewaves in basis sphere, for the |out> vector
  kpgin(npw,npkgin)= (k+G) components and related data, for the |in> vector
  kpgout(npw,nkpgout)=(k+G) components and related data, for the |out> vector
  kptin(3)=k point in terms of recip. translations, for the |in> vector
  kptout(3)=k point in terms of recip. translations, for the |out> vector
  lambda=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
         Typically lambda is the eigenvalue (or its guess)
  lmnmax=max. number of (l,m,n) components over all types of atoms
  matblk=dimension of the arrays ph3din and ph3dout
  mgfft=maximum size of 1D FFTs
  mpi_enreg=information about MPI parallelization
  natom=number of atoms in cell
  nattyp(ntypat)=number of atoms of each type
  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
  nkpgin,nkpgout=second sizes of arrays kpgin/kpgout
  nloalg(3)=governs the choice of the algorithm for nonlocal operator
  nnlout=dimension of enlout (when signs=1 and choice>0):
         ==== if paw_opt=0, 1 or 2 ====
         choice=1=>nnlout=1   choice=2=>nnlout=3*natom    choice=3=>nnlout=6
         ==== if paw_opt=3 ====
         choice=1 =>nnlout=1
         ==== if paw_opt=4 ====
         not available
  npwin=number of planewaves for given k point, for the |in> vector
  npwout=number of planewaves for given k point, for the |out> vector
  nspinor=number of spinorial components of the wavefunctions (on current proc)
  nspinortot=number of spinorial components of the wavefunctions on current proc
  ntypat=number of types of atoms in cell
  paw_opt= define the nonlocal operator concerned with:
           paw_opt=0 : Norm-conserving Vnl (use of Kleinman-Bylander ener.)
           paw_opt=1 : PAW nonlocal part of H (use of Dij coeffs)
           paw_opt=2 : PAW: (Vnl-lambda.Sij) (Sij=overlap matrix)
           paw_opt=3 : PAW overlap matrix (Sij)
           paw_opt=4 : both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
  phkxredin(2,natom)=phase factors exp(2 pi kptin.xred)
  phkxredout(2,natom)=phase factors exp(2 pi kptout.xred)
  ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
  ph3din(2,npwin,matblk)=3D structure factors, for each atom and plane wave (in)
  ph3dout(2,npwout,matblk)=3-dim structure factors, for each atom and plane wave (out)
  signs= if 1, get contracted elements (energy, forces, stress, ...)
         if 2, applies the non-local operator to a function in reciprocal space
  sij(dimenl1,ntypat*(paw_opt/3))=overlap matrix components (only if paw_opt=2, 3 or 4)
  ucvol=unit cell volume (bohr^3)
  vectin(2,npwin*nspinor)=input cmplx wavefunction coefficients <G|Cnk>
  [cprjin_left(natom,nspinor)]=The projected input wave function <p_nlm|in_left>
    for the left wavefunction. Data are assumed to be in memory, they are NOT recalculated here.
    Only signs==1 and choice==1 are supported.

OUTPUT

 ==== if (signs==1) ====
 --If (paw_opt==0, 1 or 2)
    enlout(nnlout)= contribution to the non-local part of the following properties:
      if choice=1 : enlout(1)               -> the energy
      if choice=2 : enlout(1:3*natom)       -> the forces
      if choice=3 : enlout(1:6)             -> the stresses
      if choice=23: enlout(1:6+3*natom)     -> the forces and the stresses
 --If (paw_opt==3)
    if choice=1 : enlout(nnlout)= contribution to <c|S|c>  (nnlout=1)
 --If (paw_opt==4)
    not available
 ==== if (signs==2) ====
 --if (paw_opt=0, 1 or 4)
    vectout(2,npwout*nspinor)=result of the aplication of the concerned operator
                or one of its derivatives to the input vect.:
      if (choice=1) <G|V_nonlocal|vect_start>
      if (choice=2) <G|dV_nonlocal/d(atm coord)|vect_start>
      if (choice=3) <G|dV_nonlocal/d(strain)|vect_start>
  if (paw_opt=2)
    vectout(2,npwout*nspinor)=final vector in reciprocal space:
      if (choice=1) <G|V_nonlocal-lamdba.(I+S)|vect_start>
      if (choice=2) <G|d[V_nonlocal-lamdba.(I+S)]/d(atm coord)|vect_start>
      if (choice=3) <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_start>
 --if (paw_opt=3 or 4)
    svectout(2,npwout*nspinor)=result of the aplication of Sij (overlap matrix)
                  or one of its derivatives to the input vect.:
      if (choice=1) <G|I+S|vect_start>
      if (choice=2) <G|dS/d(atm coord)|vect_start>
      if (choice=3) <G|dS/d(strain)|vect_start>

SIDE EFFECTS

  cprjin(natom,nspinor) <type(pawcprj_type)>=projected input wave function |in> on non-local projectors
                                  =<p_lmn|in> and derivatives
                    Treatment depends on cpopt parameter:
                     if cpopt=-1, <p_lmn|in> (and derivatives)
                                  are computed here (and not saved)
                     if cpopt= 0, <p_lmn|in> are computed here and saved
                                  derivatives are eventually computed but not saved
                     if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved

TODO

 * Implementation for spinorial wave functions (nspinor=2)
 * Implementation for response function (phonons, ddk, elastic tensor, ...)

PARENTS

      nonlop

CHILDREN

      dotprod_g,gpu_nonlop

SOURCE

 933  subroutine nonlop_gpu(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimffnlin,dimffnlout,&
 934 &                      enl,enlout,ffnlin,ffnlout,gprimd,idir,indlmn,istwf_k,&
 935 &                      kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,&
 936 &                      mpi_enreg,natom,nattyp,ngfft,nkpgin,nkpgout,nloalg,nnlout,&
 937 &                      npwin,npwout,nspinor,nspinortot,ntypat,paw_opt,phkxredin,phkxredout,ph1d,&
 938 &                      ph3din,ph3dout,signs,sij,svectout,ucvol,vectin,vectout)
 939 
 940 
 941 !This section has been created automatically by the script Abilint (TD).
 942 !Do not modify the following lines by hand.
 943 #undef ABI_FUNC
 944 #define ABI_FUNC 'nonlop_gpu'
 945 !End of the abilint section
 946 
 947  implicit none
 948 
 949 !Arguments ------------------------------------
 950 !scalars
 951  integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimffnlin,dimffnlout,idir
 952  integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,natom,nkpgin,nkpgout,nnlout
 953  integer,intent(in) :: npwin,npwout,nspinor,nspinortot,ntypat,paw_opt,signs
 954  real(dp),intent(in) :: lambda,ucvol
 955  type(MPI_type),intent(in) :: mpi_enreg
 956 !arrays
 957  integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin)
 958  integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3)
 959  real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2)
 960  real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat)
 961  real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat) !,gmet(3,3)
 962  real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin),kpgout(npwout,nkpgout)
 963  real(dp),intent(in) :: kptin(3),kptout(3),ph1d(2,3*(2*mgfft+1)*natom)
 964  real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom)
 965  real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3))
 966  real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk)
 967  real(dp),intent(inout) :: vectin(:,:)
 968  real(dp),intent(out) :: enlout(:)
 969  real(dp),intent(out),target :: svectout(:,:)
 970  real(dp),intent(out),target :: vectout (:,:)
 971  type(pawcprj_type),intent(inout) :: cprjin(:,:)
 972 
 973 !Local variables-------------------------------
 974 !scalars
 975  integer :: ia,iatom,ilmn,iproj,ispinor,itypat,signs_
 976  real(dp) :: doti
 977  character(len=500) :: msg
 978 !arrays
 979  real(dp),allocatable :: proj(:,:)
 980  real(dp),pointer :: svectout_(:,:),vectout_(:,:)
 981 
 982 ! **********************************************************************
 983 
 984  DBG_ENTER("COLL")
 985 
 986 !Error on bad choice
 987  if ((choice<0 .or. choice>3).and. choice/=23 .and. choice/=24) then
 988    write(msg,'(a,i0,a)')'Does not presently support this choice=',choice,'.'
 989    MSG_BUG(msg)
 990  end if
 991  if (cpopt<-1.or.cpopt>1) then
 992    msg='  Bad value for cpopt !'
 993    MSG_BUG(msg)
 994  end if
 995  if (nspinor==2) then
 996    msg='  nspinor=2 (spinorial WF) not yet allowed !'
 997    MSG_ERROR(msg)
 998  end if
 999 
1000  if ((cpopt==0).or.(cpopt==1))  then
1001    ABI_ALLOCATE(proj,(2,lmnmax*natom))
1002    proj=zero;
1003  end if
1004 
1005 !Workaround to get choice=1/signs=1 working
1006  if (choice==1.and.signs==1) then
1007    signs_=2
1008    ABI_ALLOCATE(vectout_,(2,npwin*nspinor))
1009    ABI_ALLOCATE(svectout_,(2,npwin*nspinor*(paw_opt/3)))
1010  else
1011    signs_=signs;vectout_=>vectout;svectout_=>svectout
1012  end if
1013 
1014 #if defined HAVE_GPU_CUDA
1015  call gpu_nonlop(atindx1,choice,cpopt,proj,dimenl1,dimenl2,dimffnlin,dimffnlout,&
1016 & enl,enlout,ffnlin,ffnlout,gprimd,idir,indlmn,istwf_k,&
1017 & kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,&
1018 & mpi_enreg%me_g0,natom,nattyp,ngfft,nkpgin,nkpgout,nloalg,nnlout,&
1019 & npwin,npwout,nspinor,ntypat,paw_opt,phkxredin,phkxredout,ph1d,&
1020 & ph3din,ph3dout,signs_,sij,svectout_,pi,ucvol,vectin,vectout_)
1021 #else
1022  ABI_UNUSED(nnlout)
1023 #endif
1024 
1025  if (choice==1.and.signs==1) then
1026    if (paw_opt/=3) then
1027      call dotprod_g(enlout(1),doti,istwf_k,npwin*nspinor,1,vectin,vectout_,mpi_enreg%me_g0,mpi_enreg%comm_spinorfft)
1028    else
1029      call dotprod_g(enlout(1),doti,istwf_k,npwin*nspinor,1,vectin,svectout_,mpi_enreg%me_g0,mpi_enreg%comm_spinorfft)
1030    end if
1031    ABI_DEALLOCATE(vectout_)
1032    ABI_DEALLOCATE(svectout_)
1033  else
1034    nullify(vectout_,svectout_)
1035  end if
1036 
1037  if ((cpopt==0).or.(cpopt==1)) then
1038    iproj=0
1039    do ispinor=1,nspinor
1040      iatom=0
1041      do itypat=1,ntypat
1042        do ia=1,nattyp(itypat)
1043          iatom=iatom+1;cprjin(iatom,1)%nlmn=count(indlmn(3,:,itypat)>0)
1044          do ilmn=1,cprjin(iatom,1)%nlmn
1045            iproj=iproj+1;cprjin(iatom,1)%cp(:,ilmn)=proj(:,iproj)
1046          end do
1047        end do
1048      end do
1049    end do
1050    ABI_DEALLOCATE(proj)
1051  end if
1052 
1053  DBG_EXIT("COLL")
1054 
1055 !Fake statements to satisfy ABI rules
1056 #if ! defined HAVE_GPU_CUDA
1057  if (.false.) then
1058    write(std_out,*) atindx1,enl,ffnlin,ffnlout,gprimd
1059    write(std_out,*) idir,istwf_k,kgin,kgout,kpgin,kpgout
1060    write(std_out,*) kptin,kptout,lambda,mpi_enreg%me
1061    write(std_out,*) ngfft,nloalg,ph1d,ph3din,ph3dout
1062    write(std_out,*) phkxredin,phkxredout,signs,sij
1063    write(std_out,*) ucvol,vectin
1064  end if
1065 #endif
1066 
1067 end subroutine nonlop_gpu