./66_nonlocal/m_nonlop.F90

ABINIT/m_nonlop
ABINIT/nonlop
ABINIT/nonlop_gpu

ABINIT/m_nonlop [ Modules ]

NAME

  m_nonlop

FUNCTION

  Copyright (C) 1998-2024 ABINIT group (MT)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

15 #if defined HAVE_CONFIG_H
16 #include "config.h"
17 #endif
18 
19 #include "abi_common.h"
20 
21 module m_nonlop
22 
23  use defs_basis
24  use m_errors
25  use m_abicore
26  use m_xmpi
27  use m_xomp
28  use m_cgtools
29  use m_gemm_nonlop
30  use m_gemm_nonlop_gpu
31  use m_gemm_nonlop_ompgpu
32 
33  use defs_abitypes, only : MPI_type
34  use m_time,        only : timab
35  use m_fstrings,    only : sjoin, itoa, ftoa
36  use m_hamiltonian, only : gs_hamiltonian_type, KPRIME_H_K, K_H_KPRIME, K_H_K, KPRIME_H_KPRIME
37  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_copy
38  use m_nonlop_pl,   only : nonlop_pl
39  use m_nonlop_ylm,  only : nonlop_ylm
40 
41  use, intrinsic :: iso_c_binding, only: c_loc
42 
43 #if defined HAVE_GPU_CUDA
44  use m_manage_cuda
45 #endif
46 
47  implicit none
48 
49  private

ABINIT/nonlop [ Functions ]

[ Top ] [ Functions ]

NAME

 nonlop

FUNCTION

 This routine is a driver to compute:
 * Application of a nonlocal operator Vnl_k_k^prime in order to get:
    - contracted elements (energy, forces, stresses, ...), if signs=1
    - a function in reciprocal space (|out> = Vnl|in>),    if signs=2
 * Optionally, in case of PAW calculation:
   - Application of the overlap matrix in reciprocal space
     (<in|S|in> or (I+S)|in>).
   - Application of (Vnl-lambda.S) in reciprocal space
 According to user's choice, the routine calls a subroutine, computing all quantities:
   - using Legendre Polynomials Pl (Norm-conserving psps only)
   - using Spherical Harmonics Ylm (N-conserving or PAW ; compulsory for PAW)
   - using GPUs (N-conserving or PAW)

INPUTS

  choice: chooses possible output:
    choice=0 => do nothing (only compute WF projected with NL projectors)
          =1 => non-local energy contribution
          =2 => 1st derivative(s) with respect to atomic position(s)
          =3 => 1st derivative(s) with respect to strain(s)
          =22=> mixed 2nd derivative(s) with respect to atomic pos. and q vector (at q=0)
          =25=> mixed 3rd derivative(s) with respect to atomic pos. and two q vectors (at q=0)
          =23=> 1st derivative(s) with respect to atomic pos. and
                1st derivative(s) with respect to atomic pos. and strains
          =4 => 2nd derivative(s) with respect to 2 atomic pos.
          =24=> 1st derivative(s) with respect to atm. pos. and
                2nd derivative(s) with respect to 2 atomic pos.
          =33=> mixed 2nd derivative(s) with respect to strain and q vector (at q=0)
          =5 => 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |p_i> D_ij <dp_j/dk| + |dp_i/dk> D_ij < p_j| ]
          =6 => 2nd derivative(s) with respect to 2 strains and
                mixed 2nd derivative(s) with respect to strains & atomic pos.
          =51 =>right 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |p_i> D_ij <dp_j/dk| ]
          =52 =>left 1st derivative(s) with respect to k wavevector, typically
                sum_ij [ |dp_i/dk> D_ij < p_j| ]
          =53 =>twist 1st derivative(s) with respect to k, typically
                sum_ij [ |dp_i/dk_(idir+1)> D_ij <dp_j//dk_(idir+2)|
                where idir, idir+1, idir+2 taken mod 3
          =54=> mixed 2nd derivative(s) with respect to atomic pos. and left k wavevector
          =55=> mixed 2nd derivative(s) with respect to strain and right k wavevector
          =7 => apply operator $\sum_i [ |p_i> <p_i| ],
                same as overlap operator with s_ij=identity (paw_opt==3 only)
          =8 => 2nd derivatives with respect to 2 k wavevectors
          =81=> partial 2nd derivatives with respect to 2 k wavevectors,
                full derivative with respect to k1, right derivative with respect to k2,
                (derivative with respect to k of choice 51), typically
                sum_ij [ |dp_i/dk1> D_ij <dp_j/dk2| + |p_i> D_ij < d2p_j/dk1dk2| ]
    Only choices 1,2,3,23,4,5,6 are compatible with useylm=0.
    Only choices 1,2,22,25,3,5,33,51,52,53,7,8,81 are compatible with signs=2
  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
  [enl]=optional (if not present, use hamk%ekb); non-local coeffs connecting projectors
        see hamk%ekb description
  hamk <type(gs_hamiltonian_type)>=data defining the Hamiltonian at a given k (NL part involved here)
     | atindx1(natom)=index table for atoms, inverse of atindx
     | dimekb1,dimekb2=dimensions of ekb (see ham%ekb)
     | dimekbq=1 if enl factors do not contain a exp(-iqR) phase, 2 is they do
     | ekb(dimekb1,dimekb2,nspinor**2,dimekbq)=
     |   ->NC psps (paw_opt=0) : Kleinman-Bylander energies (hartree)
     |                           dimekb1=lmnmax, dimekb2=ntypat
     |   ->PAW (paw_opt=1 or 4): Dij coefs connecting projectors (ij symmetric)
     |                           dimekb1=cplex_ekb*lmnmax*(lmnmax+1)/2, dimekb2=natom
     |                           Complex numbers if cplex_ekb=2
     |                           ekb(:,:,1)= Dij^up-up, ekb(:,:,2)= Dij^dn-dn
     |                           ekb(:,:,3)= Dij^up-dn, ekb(:,:,4)= Dij^dn-up (only if nspinor=2)
     | ffnl_k(npw_k,dimffnl_k,lmnmax,ntypat)=nonlocal form factors at k
     | ffnl_kp(npw_kp,dimffnl_kp,lmnmax,ntypat)=nonlocal form factors at k^prime
     | gmet(3,3)=metric tensor for G vecs (in bohr**-2)
     | gprimd(3,3)=dimensional reciprocal space primitive translations
     | indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=ln (useylm=0) or i=lmn (useylm=1)
     | istwf_k=option parameter that describes the storage of wfs at k
     | istwf_kp=option parameter that describes the storage of wfs at k^prime
     | lmnmax=max. number of (l,m,n) components over all types of atoms
     | matblk=dimension of the arrays ph3d_k and ph3d_kp
     | mgfft=maximum size of 1D FFTs
     | mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
     | mpssoang= 1+max(spin*angular momentum) for nonlocal pseudopotentials
     | natom=number of atoms in cell
     | nattyp(ntypat)=number of atoms of each type
     | ngfft(18)=contain all needed information about 3D FFT
     ! kg_k(3,npw_k)=integer coords of planewaves in basis sphere, for k
     ! kg_kp(3,npw_kp)=integer coords of planewaves in basis sphere, for k^prime
     ! kpg_k(npw_k,:)= (k+G) components and related data
     ! kpg_kp(npw_kp,:)=(k^prime+G) components and related data,
     ! kpt(3)=k point in terms of recip. translations
     ! kptp(3)=k^prime point in terms of recip. translations
     | nloalg(3)=governs the choice of the algorithm for nonlocal operator
     ! npw_k=number of (k+G) planewaves
     ! npw_kp=number of (k^prime+G) planewaves
     | ntypat=number of types of atoms in cell
     | nspinor=total number of spinorial components of the wavefunctions
     | ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
     | ph3d_k(2,npw_k,matblk)=3D structure factors, for each atom and (k+g) plane wave
     | ph3d_kp(2,npw_kp,matblk)=3-dim structure factors, for each atom and (k^prime+g) plane wave
     | phkxred(2,natom)=phase factors exp(2 pi k.xred)
     | phkpxred(2,natom)=phase factors exp(2 pi k^prime.xred)
     | sij(dimekb1,ntypat)=overlap matrix components (only if paw_opt=2, 3 or 4)
     | ucvol=unit cell volume (bohr^3)
     | gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
     | useylm=how the NL operator is to be applied: 1=using Ylm, 0=using Legendre polynomials
  [iatom_only]=optional. If present (and >0), only projectors related to atom of index iatom_only
          will be applied. (used fi to apply derivative of NL operator wrt an atomic displacement)
  idir=direction of the - atom to be moved in the case (choice=2,signs=2) or (choice=22,signs=2)
                        - k point direction in the case (choice=5,51,or 52)
                          for choice 53 signs=2, cross derivatives are in idir+1 and idir+2 directions (mod 3)
                        - strain component (1:6) in the case (choice=3,signs=2) or (choice=6,signs=1)
                        - strain component (1:9) in the case (choice=33,signs=2)
                        - (1:9) components to specify the atom to be moved and the second q-gradient
                          direction in the case (choice=25,signs=2)
  lambda=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
         Typically lambda is the eigenvalue (or its guess)
  mpi_enreg=information about MPI parallelization
  ndat=number of wavefunctions on which to apply nonlop
  nnlout=dimension of enlout (when signs=1 and choice>0):
         ==== if paw_opt=0, 1 or 2 ====
         choice   nnlout     |  choice   nnlout
              1   1          |      51   6 (complex)
              2   3*natom    |      52   6 (complex)
              3   6          |      53   6 (complex)
              4   6*natom    |      54   9*natom
             23   6+3*natom  |      55   36 (complex)
             24   9*natom    |       6   36+18*natom
              5   3          |       8   6
                             |      81   18 (complex)
         ==== if paw_opt=3 ====
         choice   nnlout
              1   1
              2   3*natom
              5   3
             51   3
             52   3
             54   9*natom
             55   36
              7   1
              8   6
             81   9
         ==== if paw_opt=4 ====
         not available
  [only_SO]=optional, flag to calculate only the SO part in nonlop
  paw_opt= define the nonlocal operator concerned with:
           paw_opt=0 : Norm-conserving Vnl (use of Kleinman-Bylander ener.)
           paw_opt=1 : PAW nonlocal part of H (use of Dij coeffs)
           paw_opt=2 : PAW: (Vnl-lambda.Sij) (Sij=overlap matrix)
           paw_opt=3 : PAW overlap matrix (Sij)
           paw_opt=4 : both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
  [qdir]= optional, direction of the q-gradient (only for choice=22, choice=25 and choice=33)
  [select_k]=optional, option governing the choice of k points to be used.
             hamk datastructure contains quantities needed to apply NL operator
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|Vnl|k>       is applied [default]
             if select_k=2, <k|Vnl|k^prime>       is applied
             if select_k=3, <k|Vnl|k>             is applied
             if select_k=4, <k^prime|Vnl|k^prime> is applied
  signs= if 1, get contracted elements (energy, forces, stress, ...)
         if 2, applies the non-local operator to a function in reciprocal space
  tim_nonlop=timing code of the calling routine (can be set to 0 if not attributed)
  vectin(2,npwin*my_nspinor*ndat)=input cmplx wavefunction coefficients <G|Cnk>
  vectproj(2,nprojs,my_nspinor*ndat)=Optional, vector to be used instead of cprjin%cp when provided

OUTPUT

 ==== if (signs==1) ====
 --If (paw_opt==0, 1 or 2)
    enlout(nnlout)= contribution to the non-local part of the following properties:
      if choice=1 : enlout(1)             -> the energy
      if choice=2 : enlout(3*natom)       -> 1st deriv. of energy wrt atm. pos (forces)
      if choice=3 : enlout(6)             -> 1st deriv. of energy wrt strain (stresses)
      if choice=4 : enlout(6*natom)       -> 2nd deriv. of energy wrt 2 atm. pos (dyn. mat.)
      if choice=23: enlout(6+3*natom)     -> 1st deriv. of energy wrt atm. pos (forces) and
                                             1st deriv. of energy wrt strain (stresses)
      if choice=24: enlout(9*natom)       -> 1st deriv. of energy wrt atm. pos (forces) and
                                             2nd deriv. of energy wrt 2 atm. pos (dyn. mat.)
      if choice=5 : enlout(3)             -> 1st deriv. of energy wrt k
      if choice=51: enlout(3)             -> 1st deriv. (right) of energy wrt k
      if choice=52: enlout(3)             -> 1st deriv. (left) of energy wrt k
      if choice=53: enlout(3)             -> 1st deriv. (twist) of energy wrt k
      if choice=54: enlout(18*natom)      -> 2nd deriv. of energy wrt atm. pos and right k (Born eff. charge)
      if choice=55: enlout(36)            -> 2nd deriv. of energy wrt strain and right k (piezoelastic tensor)
      if choice=6 : enlout(36+18*natom)   -> 2nd deriv. of energy wrt 2 strains (elast. tensor) and
                                             2nd deriv. of energy wrt to atm. pos and strain (internal strain)
      if choice=8 : enlout(6)             -> 2nd deriv. of energy wrt 2 k
      if choice=81: enlout(9)             -> 2nd deriv.of E: full derivative w.r.t. k1, right derivative w.r.t k2
 --If (paw_opt==3)
      if choice=1 : enlout(1)             -> contribution to <c|S|c> (note: not including <c|c>)
      if choice=2 : enlout(3*natom)       -> contribution to <c|dS/d_atm.pos|c>
      if choice=51: enlout(3)             -> contribution to <c|d(right)S/d_k|c>
      if choice=52: enlout(3)             -> contribution to <c|d(left)S/d_k|c>
      if choice=54: enlout(18*natom)      -> 2nd deriv. of energy wrt atm. pos and right k (Born eff. charge)
      if choice=55: enlout(36)            -> 2nd deriv. of energy wrt strain and right k (piezoelastic tensor)
      if choice=7 : enlout(1)             -> contribution to <c|sum_i[p_i><p_i]|c>
      if choice=8 : enlout(6)             -> contribution to <c|d2S/d_k1d_k2|c>
      if choice=81: enlout(9)             -> contribution to <c|dS/d_k1[d(right)d_k2]|c>
 --If (paw_opt==4)
      not available
 ==== if (signs==2) ====
 --if (paw_opt=0)
    vectout(2,npwout*my_nspinor*ndat)=result of the aplication of the concerned operator
                or one of its derivatives to the input vect.
      if (choice=22) <G|d2V_nonlocal/d(atm. pos)dq|vect_in> (at q=0)
      if (choice=25) <G|d3V_nonlocal/d(atm. pos)dqdq|vect_in> (at q=0)
      if (choice=33) <G|d2V_nonlocal/d(strain)dq|vect_in> (at q=0)
 --if (paw_opt=0, 1 or 4)
    vectout(2,npwout*my_nspinor*ndat)=result of the aplication of the concerned operator
                or one of its derivatives to the input vect.:
      if (choice=1)  <G|V_nonlocal|vect_in>
      if (choice=2)  <G|dV_nonlocal/d(atm. pos)|vect_in>
      if (choice=3)  <G|dV_nonlocal/d(strain)|vect_in>
      if (choice=5)  <G|dV_nonlocal/d(k)|vect_in>
      if (choice=51) <G|d(right)V_nonlocal/d(k)|vect_in>
      if (choice=52) <G|d(left)V_nonlocal/d(k)|vect_in>
      if (choice=53) <G|d(twist)V_nonlocal/d(k)|vect_in>
      if (choice=8)  <G|d2V_nonlocal/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right)V_nonlocal/d(k)]/d(k)|vect_in>
 --if (paw_opt=2)
    vectout(2,npwout*my_nspinor*ndat)=final vector in reciprocal space:
      if (choice=1)  <G|V_nonlocal-lamdba.(I+S)|vect_in>
      if (choice=2)  <G|d[V_nonlocal-lamdba.(I+S)]/d(atm. pos)|vect_in>
      if (choice=3)  <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_in>
      if (choice=5)  <G|d[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=51) <G|d(right)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=52) <G|d(left)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=53) <G|d(twist)[V_nonlocal-lamdba.(I+S)]/d(k)|vect_in>
      if (choice=8)  <G|d2[V_nonlocal-lamdba.(I+S)]/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right[V_nonlocal-lamdba.(I+S)]/d(k)]/d(k)|vect_in>
 --if (paw_opt=3 or 4)
    svectout(2,npwout*my_nspinor*ndat)=result of the aplication of Sij (overlap matrix)
                  or one of its derivatives to the input vect.:
      if (choice=1)  <G|I+S|vect_in>
      if (choice=2)  <G|dS/d(atm. pos)|vect_in>
      if (choice=3)  <G|dS/d(strain)|vect_in>
      if (choice=5)  <G|dS/d(k)|vect_in>
      if (choice=51) <G|d(right)S/d(k)|vect_in>
      if (choice=52) <G|d(left)S/d(k)|vect_in>
      if (choice=53) <G|d(twist)S/d(k)|vect_in>
      if (choice=3)  <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_in>
      if (choice=7)  <G|sum_i[p_i><p_i]|vect_in>
      if (choice=8)  <G|d2S/d(k)d(k)|vect_in>
      if (choice=81) <G|d[d(right)S/d(k)]/d(k)|vect_in>

SIDE EFFECTS

  ==== ONLY IF useylm=1
  cprjin(natom,my_nspinor*ndat) <type(pawcprj_type)>=projected input wave function |in> on non-local projectors
                                  =<p_lmn|in> and derivatives
                    Treatment depends on cpopt parameter:
                     if cpopt=-1, <p_lmn|in> (and derivatives)
                                  are computed here (and not saved)
                     if cpopt= 0, <p_lmn|in> are computed here and saved
                                  derivatives are eventually computed but not saved
                     if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 2  <p_lmn|in> are already in memory;
                                  first (and 2nd) derivatives are computed here and not saved
                     if cpopt= 3  <p_lmn|in> are already in memory;
                                  first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 4  <p_lmn|in> and first derivatives are already in memory;
                                  other derivatives are not computed
                                  This option is not compatible with choice=4,24 or 6
                     If useylm=0, must have cpopt=-1!
                     Warning: for cpopt= 1 or 3, derivatives wrt strains do not contain
                              the contribution due to the volume change;
                              i.e. <dp_lmn/dEps|in> are incomplete.

NOTES

 * See nonlop_pl and nonlop_ylm to have more comments...
 * In the case signs=1, the array vectout is not used.

SOURCE

331 subroutine nonlop(choice,cpopt,cprjin,enlout,hamk,idir,lambda,mpi_enreg,ndat,nnlout,&
332 &                 paw_opt,signs,svectout,tim_nonlop,vectin,vectout,&
333 &                 cprjin_left,enl,enlout_im,iatom_only,ndat_left,only_SO,qdir,select_k,vectproj) !optional arguments
334 
335 !Arguments ------------------------------------
336 !scalars
337  integer,intent(in) :: choice,cpopt,idir,ndat,nnlout,paw_opt,signs,tim_nonlop
338  integer,intent(in),optional :: iatom_only,only_SO,qdir,ndat_left,select_k
339  type(MPI_type),intent(in) :: mpi_enreg
340  type(gs_hamiltonian_type),intent(in),target :: hamk
341 !arrays
342  real(dp),intent(in) :: lambda(ndat)
343  real(dp),intent(in),target,optional :: enl(:,:,:,:)
344  real(dp),intent(inout),target :: vectin(:,:)
345  real(dp),intent(out),target :: enlout(:),svectout(:,:)
346  real(dp),intent(out),optional :: enlout_im(:)
347  real(dp),intent(inout),target :: vectout(:,:)
348  type(pawcprj_type),intent(inout),target :: cprjin(:,:)
349  type(pawcprj_type),intent(inout),target,optional :: cprjin_left(:,:)
350  real(dp),intent(inout), ABI_CONTIGUOUS optional :: vectproj(:,:,:)
351 
352 
353 !Local variables-------------------------------
354 !scalars
355  integer :: dimenl1,dimenl2,dimenl2_,dimekbq,dimffnlin,dimffnlout,dimsij,iatm,iatom_only_,idat
356  integer :: ii,ispden,ispinor,istwf_k,itypat,jspinor,matblk_,my_nspinor,n1,n2,n3,natom_,ncpgr_atm,ndat_left_
357  integer :: nkpgin,nkpgout,npwin,npwout,ntypat_,only_SO_,select_k_,shift1,shift2,shift3
358  logical :: atom_pert,force_recompute_ph3d,kpgin_allocated,kpgout_allocated
359  logical :: use_gemm_nonlop
360  character(len=500) :: msg
361 !arrays
362  integer :: nlmn_atm(1),nloalg_(3)
363  integer,pointer :: kgin(:,:),kgout(:,:)
364  integer, ABI_CONTIGUOUS pointer :: atindx1_(:),indlmn_(:,:,:),nattyp_(:)
365  real(dp) :: tsec(2)
366  real(dp),pointer :: enl_ptr(:,:,:,:)
367  real(dp),pointer :: ffnlin(:,:,:,:),ffnlin_(:,:,:,:),ffnlout(:,:,:,:),ffnlout_(:,:,:,:)
368  real(dp),pointer :: kpgin(:,:),kpgout(:,:)
369  real(dp) :: kptin(3),kptout(3)
370  real(dp),pointer :: ph3din(:,:,:),ph3din_(:,:,:),ph3dout(:,:,:),ph3dout_(:,:,:)
371  real(dp),pointer :: phkxredin(:,:),phkxredin_(:,:),phkxredout(:,:),phkxredout_(:,:)
372  real(dp), ABI_CONTIGUOUS pointer :: ph1d_(:,:),sij_(:,:)
373  real(dp), pointer :: enl_(:,:,:,:)
374  type(pawcprj_type),pointer :: cprjin_(:,:)
375  integer :: b0,b1,b2,b3,b4,e0,e1,e2,e3,e4
376 
377 ! **********************************************************************
378 
379  DBG_ENTER("COLL")
380 
381 !Keep track of time spent in this routine (selection of different slots for different choices)
382  call timab(220+tim_nonlop,1,tsec)
383 
384 ! Increment global counter
385 !$OMP MASTER
386  nonlop_counter = nonlop_counter + ndat
387 !$OMP END MASTER
388 
389  only_SO_=0; if (present(only_SO)) only_SO_=only_SO
390  my_nspinor=max(1,hamk%nspinor/mpi_enreg%nproc_spinor)
391 
392  force_recompute_ph3d=.false.
393 
394 !Error(s) on incorrect input
395  if (hamk%useylm==0) then
396    if (paw_opt>0) then
397      ABI_BUG('When paw_opt>0 you must use ylm version of nonlop! Set useylm 1.')
398    end if
399    if (cpopt/=-1) then
400      ABI_BUG('If useylm=0, ie no PAW, then cpopt/=-1 is not allowed !')
401    end if
402    if (hamk%dimekbq/=1) then
403      ABI_BUG('If useylm=0, ie no PAW, then dimekbq/=-1 is not allowed !')
404    end if
405    if (hamk%gpu_option/=ABI_GPU_DISABLED) then
406      ABI_BUG('When gpu_option/=0 you must use ylm version of nonlop! Set useylm to 1.')
407    end if
408  end if
409  if (hamk%gpu_option/=ABI_GPU_DISABLED.and.hamk%dimekbq/=1) then
410    ABI_BUG('GPU version of nonlop not compatible with a exp(-iqR) phase!')
411  end if
412  if ((.not.associated(hamk%kg_k)).or.(.not.associated(hamk%kg_kp))) then
413    ABI_BUG('kg_k/kg_kp should be associated!')
414  end if
415  if ((.not.associated(hamk%ffnl_k)).or.(.not.associated(hamk%ffnl_kp))) then
416    ABI_BUG('ffnl_k/ffnl_kp should be associated!')
417  end if
418 !if (hamk%istwf_k/=hamk%istwf_kp) then
419 !  ABI_BUG('istwf has to be the same for both k-points.')
420 !end if
421 
422 !Select k-dependent objects according to select_k input parameter
423  select_k_=1;if (present(select_k)) select_k_=select_k
424  nkpgin=0;nkpgout=0;nullify(kpgin);nullify(kpgout)
425  nullify(ph3din);nullify(ph3dout)
426  if (select_k_==KPRIME_H_K) then
427 !  ===== <k^prime|Vnl|k> =====
428    kptin = hamk%kpt_k ; kptout = hamk%kpt_kp
429    npwin=hamk%npw_fft_k ; npwout=hamk%npw_fft_kp
430    kgin => hamk%kg_k ; kgout => hamk%kg_kp
431    if (associated(hamk%kpg_k)) then
432      kpgin => hamk%kpg_k ; nkpgin=size(kpgin,2)
433    end if
434    if (associated(hamk%kpg_kp)) then
435      kpgout => hamk%kpg_kp ; nkpgout=size(kpgout,2)
436    end if
437    phkxredin => hamk%phkxred ; phkxredout => hamk%phkpxred
438    ffnlin => hamk%ffnl_k ; ffnlout => hamk%ffnl_kp
439    if (associated(hamk%ph3d_k )) ph3din  => hamk%ph3d_k
440    if (associated(hamk%ph3d_kp)) ph3dout => hamk%ph3d_kp
441    force_recompute_ph3d=(.not.(associated(hamk%ph3d_k).and.associated(hamk%ph3d_kp)))
442    istwf_k=hamk%istwf_k
443  else if (select_k_==K_H_KPRIME) then
444 !  ===== <k|Vnl|k^prime> =====
445    kptin = hamk%kpt_kp ; kptout = hamk%kpt_k
446    npwin=hamk%npw_fft_kp ; npwout=hamk%npw_fft_k
447    kgin => hamk%kg_kp ; kgout => hamk%kg_k
448    if (associated(hamk%kpg_kp)) then
449      kpgin => hamk%kpg_kp ; nkpgin=size(kpgin,2)
450    end if
451    if (associated(hamk%kpg_k)) then
452      kpgout => hamk%kpg_k ; nkpgout=size(kpgout,2)
453    end if
454    phkxredin => hamk%phkpxred ; phkxredout => hamk%phkxred
455    ffnlin => hamk%ffnl_kp ; ffnlout => hamk%ffnl_k
456    if (associated(hamk%ph3d_kp)) ph3din  => hamk%ph3d_kp
457    if (associated(hamk%ph3d_k )) ph3dout => hamk%ph3d_k
458    force_recompute_ph3d=(.not.(associated(hamk%ph3d_kp).and.associated(hamk%ph3d_k)))
459    istwf_k=hamk%istwf_kp
460  else if (select_k_==K_H_K) then
461 !  ===== <k|Vnl|k> =====
462    kptin = hamk%kpt_k ; kptout = hamk%kpt_k
463    npwin=hamk%npw_fft_k ; npwout=hamk%npw_fft_k
464    kgin => hamk%kg_k ; kgout => hamk%kg_k
465    if (associated(hamk%kpg_k)) then
466      kpgin => hamk%kpg_k ; nkpgin=size(kpgin,2)
467    end if
468    if (associated(hamk%kpg_k)) then
469      kpgout => hamk%kpg_k ; nkpgout=size(kpgout,2)
470    end if
471    phkxredin => hamk%phkxred ; phkxredout => hamk%phkxred
472    ffnlin => hamk%ffnl_k ; ffnlout => hamk%ffnl_k
473    if (associated(hamk%ph3d_k)) ph3din  => hamk%ph3d_k
474    if (associated(hamk%ph3d_k)) ph3dout => hamk%ph3d_k
475    force_recompute_ph3d=(.not.(associated(hamk%ph3d_k)))
476    istwf_k=hamk%istwf_k
477  else if (select_k_==KPRIME_H_KPRIME) then
478 !  ===== <k^prime|Vnl|k^prime> =====
479    kptin = hamk%kpt_kp ; kptout = hamk%kpt_kp
480    npwin=hamk%npw_fft_kp ; npwout=hamk%npw_fft_kp
481    kgin => hamk%kg_kp ; kgout => hamk%kg_kp
482    if (associated(hamk%kpg_kp)) then
483      kpgin => hamk%kpg_kp ; nkpgin=size(kpgin,2)
484    end if
485    if (associated(hamk%kpg_kp)) then
486      kpgout => hamk%kpg_kp ; nkpgout=size(kpgout,2)
487    end if
488    phkxredin => hamk%phkpxred ; phkxredout => hamk%phkpxred
489    ffnlin => hamk%ffnl_kp ; ffnlout => hamk%ffnl_kp
490    if (associated(hamk%ph3d_kp)) ph3din  => hamk%ph3d_kp
491    if (associated(hamk%ph3d_kp)) ph3dout => hamk%ph3d_kp
492    force_recompute_ph3d=(.not.(associated(hamk%ph3d_kp)))
493    istwf_k=hamk%istwf_kp
494  end if
495 
496  if (npwin==0.or.npwout==0) return
497  dimffnlin=size(ffnlin,2);dimffnlout=size(ffnlout,2)
498  kpgin_allocated=(.not.associated(kpgin))
499  if (kpgin_allocated) then
500    ABI_MALLOC(kpgin,(npwin,0))
501  end if
502  kpgout_allocated=(.not.associated(kpgout))
503  if (kpgout_allocated) then
504    ABI_MALLOC(kpgout,(npwout,0))
505  end if
506 
507 !Check some sizes for safety
508 !if (paw_opt==0.or.cpopt<2.or.((cpopt==2.or.cpopt==3).and.choice>1)) then
509  if (size(ffnlin,1)/=npwin.or.size(ffnlin,3)/=hamk%lmnmax) then
510    msg = 'Incorrect size for ffnlin!'
511 !   ABI_BUG(msg)
512  end if
513  if(signs==2) then
514    if (size(ffnlout,1)/=npwout.or.size(ffnlout,3)/=hamk%lmnmax) then
515      ABI_BUG('Incorrect size for ffnlout!')
516    end if
517  end if
518 !This test is OK only because explicit sizes are passed to nonlop_* routines
519  if (size(vectin)<2*npwin*my_nspinor*ndat) then
520    !FB: Allow the usage of nonlop from the "linalg" representation where
521    !FB: the cg are distributed over the plane waves with npband > 1
522    !FB: in case signs=1 & choice=1
523    if (signs==1 .and. choice==1) then
524       npwin = size(vectin,2)/ndat/my_nspinor
525    else
526       ABI_BUG('Incorrect size for vectin!')
527    end if
528  end if
529  if(choice/=0.and.signs==2) then
530    if(paw_opt/=3) then
531 !    This test is OK only because explicit sizes are passed to nonlop_* routines
532      if (size(vectout)<2*npwout*my_nspinor*ndat) then
533        ABI_BUG('Incorrect size for vectout!')
534      end if
535    end if
536    if(paw_opt>=3) then
537      if (size(svectout)<2*npwout*my_nspinor*ndat) then
538        ABI_BUG('Incorrect size for svectout!')
539      end if
540    end if
541  end if
542  if(cpopt>=0 .and. .not. present(vectproj)) then
543    if (size(cprjin)<hamk%natom*my_nspinor*ndat) then
544      ABI_BUG('Incorrect size for cprjin!')
545    end if
546  end if
547  ndat_left_ = 1
548  if (present(ndat_left)) then
549    ndat_left_ = ndat_left
550  end if
551  if(present(cprjin_left)) then
552    if (size(cprjin_left)/=hamk%natom*my_nspinor*ndat*ndat_left_) then
553      msg = 'Incorrect size for cprjin_left!'
554      ABI_BUG(msg)
555    end if
556  end if
557 
558 !Non-local coefficients connecting projectors:
559 !If enl is present in the arg list, use it; instead use hamk%ebk
560  if (present(enl)) then
561    enl_ptr => enl
562    dimenl1=size(enl,1);dimenl2=size(enl,2);dimekbq=size(enl,4)
563  else
564    enl_ptr => hamk%ekb
565    dimenl1=hamk%dimekb1;dimenl2=hamk%dimekb2;dimekbq=1
566  end if
567 
568 !In the case of a derivative with respect to an atomic displacement,
569 !and if <g|dVnl/dR|c> is required (signs=2), we only need to compute the
570 !derivatives of the projectors associated with the displaced atom.
571  iatom_only_=-1;if (present(iatom_only)) iatom_only_=iatom_only
572  atom_pert=((signs==2).and.(choice==2.or.choice==4.or.choice==22.or.choice==24.or.choice==25.or.choice==54))
573 
574  if (iatom_only_>0.and.atom_pert) then
575 !   We consider only atom with index iatom_only
576    iatm=hamk%atindx(iatom_only_);itypat=hamk%typat(iatom_only_)
577    natom_=1 ; ntypat_=1 ; dimenl2_=1 ; matblk_=1
578    nloalg_(:)=hamk%nloalg(:)
579    ABI_MALLOC(atindx1_,(1))
580    ABI_MALLOC(nattyp_,(1))
581    atindx1_(1)=1 ; nattyp_(1)=1
582 !  Store at the right place the 1d phases
583    n1=hamk%ngfft(1);n2=hamk%ngfft(2);n3=hamk%ngfft(3)
584    ABI_MALLOC(ph1d_,(2,(2*n1+1)+(2*n2+1)+(2*n3+1)))
585    shift1=(iatm-1)*(2*n1+1)
586    ph1d_(:,1:2*n1+1)=hamk%ph1d(:,1+shift1:2*n1+1+shift1)
587    shift2=(iatm-1)*(2*n2+1)+hamk%natom*(2*n1+1)
588    ph1d_(:,1+2*n1+1:2*n2+1+2*n1+1)=hamk%ph1d(:,1+shift2:2*n2+1+shift2)
589    shift3=(iatm-1)*(2*n3+1)+hamk%natom*(2*n1+1+2*n2+1)
590    ph1d_(:,1+2*n1+1+2*n2+1:2*n3+1+2*n2+1+2*n1+1)=hamk%ph1d(:,1+shift3:2*n3+1+shift3)
591    ABI_MALLOC(phkxredin_,(2,1))
592    ABI_MALLOC(phkxredout_,(2,1))
593    phkxredin_(:,1)=phkxredin(:,iatm)
594    phkxredout_(:,1)=phkxredout(:,iatm)
595    ABI_MALLOC(ph3din_,(2,npwin,1))
596    ABI_MALLOC(ph3dout_,(2,npwout,1))
597    if (force_recompute_ph3d.or.hamk%matblk<hamk%natom) then
598      nloalg_(2)=-abs(nloalg_(2)) !Will compute the 3D phase factors inside nonlop
599    else
600      ph3din_(:,1:npwin,1)=ph3din(:,1:npwin,iatm)
601      ph3dout_(:,1:npwout,1)=ph3dout(:,1:npwout,iatm)
602    end if
603    ABI_MALLOC(ffnlin_,(npwin,dimffnlin,hamk%lmnmax,1))
604    ABI_MALLOC(ffnlout_,(npwout,dimffnlout,hamk%lmnmax,1))
605    ffnlin_(:,:,:,1)=ffnlin(:,:,:,itypat)
606    ffnlout_(:,:,:,1)=ffnlout(:,:,:,itypat)
607    ABI_MALLOC(cprjin_,(1,my_nspinor*((cpopt+5)/5)))
608    if (cpopt>=0) then
609      nlmn_atm(1)=cprjin(iatm,1)%nlmn
610      ncpgr_atm=cprjin(iatm,1)%ncpgr
611      call pawcprj_alloc(cprjin_,ncpgr_atm,nlmn_atm)
612      do idat=1,ndat
613        do ispinor=1,my_nspinor
614          jspinor=ispinor+(idat-1)*my_nspinor
615          call pawcprj_copy(cprjin(iatm:iatm,jspinor:jspinor),cprjin_(1:1,jspinor:jspinor))
616        end do
617      end do
618    end if
619    if (size(enl_ptr)>0) then
620      ABI_MALLOC(enl_,(size(enl_ptr,1),1,hamk%nspinor**2,size(enl_ptr,4)))
621      do ii=1,size(enl_ptr,4)
622        do ispden=1,hamk%nspinor**2
623          if (dimenl2==hamk%natom .and. hamk%usepaw==1) then
624            enl_(:,1,ispden,ii)=enl_ptr(:,iatom_only_,ispden,ii)
625          else if (dimenl2==hamk%ntypat) then
626            enl_(:,1,ispden,ii)=enl_ptr(:,itypat,ispden,ii)
627          else
628            enl_(:,1,ispden,ii)=enl_ptr(:,1,ispden,ii)
629          end if
630        end do
631      end do
632    else
633      ABI_MALLOC(enl_,(0,0,0,0))
634    end if
635    if (allocated(hamk%sij)) then
636      dimsij=size(hamk%sij,1)
637      ABI_MALLOC(sij_,(dimsij,1))
638      if (size(hamk%sij,2)==hamk%ntypat) then
639        sij_(:,1)=hamk%sij(:,itypat)
640      else if (size(hamk%sij)>0) then
641        sij_(:,1)=hamk%sij(:,1)
642      end if
643    end if
644    ABI_MALLOC(indlmn_,(6,hamk%lmnmax,1))
645    indlmn_(:,:,1)=hamk%indlmn(:,:,itypat)
646 
647  else
648 !  Usual case: all atoms are processed
649    natom_  =hamk%natom; ntypat_=hamk%ntypat
650    dimenl2_=dimenl2   ; matblk_=hamk%matblk
651    nloalg_(:)  = hamk%nloalg(:)
652    atindx1_    => hamk%atindx1
653    nattyp_     => hamk%nattyp
654    ph1d_       => hamk%ph1d
655    phkxredin_  => phkxredin
656    phkxredout_ => phkxredout
657    ffnlin_     => ffnlin
658    ffnlout_    => ffnlout
659    cprjin_     => cprjin
660    enl_        => enl_ptr
661    sij_        => hamk%sij
662    indlmn_     => hamk%indlmn
663    if (force_recompute_ph3d) then
664      nloalg_(2)=-abs(nloalg_(2)) !Will compute the 3D phase factors inside nonlop
665      ABI_MALLOC(ph3din_,(2,npwin,hamk%matblk))
666      ABI_MALLOC(ph3dout_,(2,npwout,hamk%matblk))
667    else
668      ph3din_     => ph3din
669      ph3dout_    => ph3dout
670    end if
671  end if
672 
673 !A specific version of nonlop based on BLAS3 can be used
674 !But there are several restrictions
675 
676  use_gemm_nonlop=.false.
677  if (gemm_nonlop_use_gemm) then
678    use_gemm_nonlop=gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%nprojs>0
679    if(signs==2) then
680      use_gemm_nonlop= ( use_gemm_nonlop .and. &
681 &      ( paw_opt /= 2 .and. &
682 &        cpopt < 3 .and. hamk%useylm /= 0 .and. &
683 &        (choice < 2 .or. choice == 7) ) )
684    end if
685    if(signs==1) then
686      use_gemm_nonlop= ( use_gemm_nonlop .and. hamk%useylm/=0 .and. &
687        ! Forces and stress (forstr)
688 &      ( ((choice >= 1 .and. choice <= 3) .or. choice == 23) .and. &
689 &        gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%ngrads>0 ) .or. &
690        ! Rho ij
691 &      choice == 0  )
692      !FIXME forces and constraints computation not handled in CUDA GEMM nonlop
693      if(choice > 0 .and. (hamk%gpu_option==ABI_GPU_LEGACY .or. hamk%gpu_option==ABI_GPU_KOKKOS)) use_gemm_nonlop=.false.
694    end if
695  end if
696 
697  if(use_gemm_nonlop) then
698 
699    !FIXME Settle this
700    if(hamk%gpu_option==ABI_GPU_OPENMP) then
701 
702      call gemm_nonlop_ompgpu(atindx1_,choice,cpopt,cprjin_,dimenl1,dimenl2_,dimekbq,&
703          dimffnlin,dimffnlout,enl_,enlout,ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,&
704          idir,indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,&
705          hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,hamk%mpsang,hamk%mpssoang,&
706          natom_,nattyp_,ndat,hamk%ngfft,nkpgin,nkpgout,nloalg_,&
707          nnlout,npwin,npwout,my_nspinor,hamk%nspinor,ntypat_,only_SO_,paw_opt,&
708          phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,svectout,&
709          tim_nonlop,hamk%ucvol,hamk%useylm,vectin,vectout,vectproj=vectproj,&
710          gpu_option=hamk%gpu_option)
711 
712    else if (hamk%gpu_option==ABI_GPU_LEGACY .or. hamk%gpu_option==ABI_GPU_KOKKOS) then
713 
714 #if defined HAVE_GPU_CUDA
715      call gemm_nonlop_gpu(atindx1_, choice, cpopt, cprjin_, dimenl1, dimenl2_, dimekbq, &
716          dimffnlin, dimffnlout, &
717          enl_, indlmn_, istwf_k, &
718          lambda, hamk%lmnmax, matblk_, &
719          mpi_enreg, natom_, nattyp_, ndat, nkpgin, nkpgout, &
720          nnlout, npwin, npwout, my_nspinor, hamk%nspinor, ntypat_, paw_opt, &
721          sij_, svectout, &
722          hamk%useylm, vectin, vectout, &
723          vectproj=vectproj,gpu_option=hamk%gpu_option)
724 #else
725    ABI_ERROR("abinit was not compiled with GPU support")
726 #endif
727 
728    else
729 
730      call gemm_nonlop(atindx1_,choice,cpopt,cprjin_,dimenl1,dimenl2_,dimekbq,&
731          dimffnlin,dimffnlout,enl_,enlout,ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,&
732          idir,indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,&
733          hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,hamk%mpsang,hamk%mpssoang,&
734          natom_,nattyp_,ndat,hamk%ngfft,nkpgin,nkpgout,nloalg_,&
735          nnlout,npwin,npwout,my_nspinor,hamk%nspinor,ntypat_,only_SO_,paw_opt,&
736          phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,svectout,&
737          tim_nonlop,hamk%ucvol,hamk%useylm,vectin,vectout,vectproj=vectproj,&
738          gpu_option=hamk%gpu_option)
739 
740    end if
741 
742  else
743 
744    if(xomp_target_is_present(c_loc(vectin))) then
745 #ifdef HAVE_OPENMP_OFFLOAD
746      !$OMP TARGET UPDATE FROM(vectin,vectout,svectout) IF(hamk%gpu_option==ABI_GPU_OPENMP)
747 #endif
748    end if
749 
750    !$omp parallel do default(shared), &
751    !$omp& firstprivate(ndat,npwin,my_nspinor,choice,signs,paw_opt,npwout,cpopt,nnlout), &
752    !$omp& private(b0,b1,b2,b3,b4,e0,e1,e2,e3,e4)
753    !!$omp& schedule(static), if(hamk%gpu_option==ABI_GPU_DISABLED)
754    do idat=1, ndat
755      !vectin_idat => vectin(:,1+npwin*my_nspinor*(idat-1):npwin*my_nspinor*idat)
756      b0 = 1+npwin*my_nspinor*(idat-1)
757      e0 = npwin*my_nspinor*idat
758      if (choice/=0.and.signs==2.and.paw_opt/=3) then
759        !vectout_idat => vectout(:,1+npwout*my_nspinor*(idat-1):npwout*my_nspinor*idat)
760        b1 = 1+npwout*my_nspinor*(idat-1)
761        e1 = npwout*my_nspinor*idat
762      else
763        !vectout_idat => vectout
764        b1 = lbound(vectout,dim=2)
765        e1 = ubound(vectout,dim=2)
766      end if
767      if (choice/=0.and.signs==2.and.paw_opt>=3) then
768        !svectout_idat => svectout(:,1+npwout*my_nspinor*(idat-1):npwout*my_nspinor*idat)
769        b2 = 1+npwout*my_nspinor*(idat-1)
770        e2 = npwout*my_nspinor*idat
771      else
772        !svectout_idat => svectout
773        b2 = lbound(svectout,dim=2)
774        e2 = ubound(svectout,dim=2)
775      end if
776 
777      if (cpopt>=0) then
778        !cprjin_idat => cprjin_(:,my_nspinor*(idat-1)+1:my_nspinor*(idat))
779        b3 = my_nspinor*(idat-1)+1
780        e3 = my_nspinor*(idat)
781      else
782        !cprjin_idat => cprjin_
783        b3 = lbound(cprjin_,dim=2)
784        e3 = ubound(cprjin_,dim=2)
785      end if
786      if (nnlout>0) then
787       !enlout_idat => enlout((idat-1)*nnlout+1:(idat*nnlout))
788        b4 = (idat-1)*nnlout*ndat_left_+1
789        e4 = (idat*nnlout*ndat_left_)
790      else
791       !enlout_idat => enlout
792        b4 = lbound(enlout,dim=1)
793        e4 = ubound(enlout,dim=1)
794      end if
795 
796 !    Legendre Polynomials version
797      if (hamk%useylm==0) then
798        call nonlop_pl(choice,dimenl1,dimenl2_,dimffnlin,dimffnlout,enl_,&
799 &       enlout(b4:e4),ffnlin_,ffnlout_,hamk%gmet,hamk%gprimd,idir,indlmn_,istwf_k,&
800 &       kgin,kgout,kpgin,kpgout,kptin,kptout,hamk%lmnmax,matblk_,hamk%mgfft,&
801 &       mpi_enreg,hamk%mpsang,hamk%mpssoang,natom_,nattyp_,hamk%ngfft,&
802 &       nkpgin,nkpgout,nloalg_,npwin,npwout,my_nspinor,hamk%nspinor,&
803 &       ntypat_,only_SO_,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,hamk%ucvol,&
804 &       vectin(:,b0:e0),vectout(:,b1:e1))
805 !    Spherical Harmonics version
806      else if (hamk%gpu_option==ABI_GPU_DISABLED .or. hamk%gpu_option==ABI_GPU_OPENMP) then
807        if (present(cprjin_left).and.present(enlout_im)) then
808          call nonlop_ylm(atindx1_,choice,cpopt,cprjin_(:,b3:e3),dimenl1,dimenl2_,dimekbq,&
809 &         dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
810 &         indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
811 &         hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
812 &         nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
813 &         ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
814 &         svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1),qdir=qdir,&
815           cprjin_left=cprjin_left,enlout_im=enlout_im,ndat_left=ndat_left_)
816       else
817          call nonlop_ylm(atindx1_,choice,cpopt,cprjin_(:,b3:e3),dimenl1,dimenl2_,dimekbq,&
818 &         dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
819 &         indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
820 &         hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
821 &         nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
822 &         ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
823 &         svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1),qdir=qdir)
824        end if
825 !    GPU version
826      else
827        call nonlop_gpu(atindx1_,choice,cpopt,cprjin(:,b3:e3),dimenl1,dimenl2_,&
828 &       dimffnlin,dimffnlout,enl_,enlout(b4:e4),ffnlin_,ffnlout_,hamk%gprimd,idir,&
829 &       indlmn_,istwf_k,kgin,kgout,kpgin,kpgout,kptin,kptout,lambda(idat),&
830 &       hamk%lmnmax,matblk_,hamk%mgfft,mpi_enreg,natom_,nattyp_,hamk%ngfft,&
831 &       nkpgin,nkpgout,nloalg_,nnlout,npwin,npwout,my_nspinor,hamk%nspinor,&
832 &       ntypat_,paw_opt,phkxredin_,phkxredout_,ph1d_,ph3din_,ph3dout_,signs,sij_,&
833 &       svectout(:,b2:e2),hamk%ucvol,vectin(:,b0:e0),vectout(:,b1:e1))
834     end if
835 
836    end do
837    !$omp end parallel do
838 
839    if(xomp_target_is_present(c_loc(vectin))) then
840 #ifdef HAVE_OPENMP_OFFLOAD
841      !$OMP TARGET UPDATE TO(vectin,vectout,svectout) IF(hamk%gpu_option==ABI_GPU_OPENMP)
842 #endif
843    end if
844 
845  end if
846 
847 !Release temporary storage
848  if (iatom_only_>0.and.atom_pert) then
849    if (cpopt>=0) then
850      call pawcprj_free(cprjin_)
851    end if
852    ABI_FREE(atindx1_)
853    ABI_FREE(nattyp_)
854    ABI_FREE(ph1d_)
855    ABI_FREE(ph3din_)
856    ABI_FREE(ph3dout_)
857    ABI_FREE(phkxredin_)
858    ABI_FREE(phkxredout_)
859    ABI_FREE(ffnlin_)
860    ABI_FREE(ffnlout_)
861    ABI_FREE(enl_)
862    ABI_FREE(indlmn_)
863    ABI_FREE(cprjin_)
864    if (allocated(hamk%sij)) then
865      ABI_FREE(sij_)
866    end if
867  else if (force_recompute_ph3d) then
868    ABI_FREE(ph3din_)
869    ABI_FREE(ph3dout_)
870  end if
871  if (kpgin_allocated) then
872    ABI_FREE(kpgin)
873  end if
874  if (kpgout_allocated) then
875    ABI_FREE(kpgout)
876  end if
877 
878  call timab(220+tim_nonlop,2,tsec)
879 
880  DBG_EXIT("COLL")
881 
882 end subroutine nonlop

ABINIT/nonlop_gpu [ Functions ]

[ Top ] [ Functions ]

NAME

 nonlop_gpu

FUNCTION

  Compute application of a nonlocal operator, using GPU (NVidia Cuda)
  This routine is an interface to Cuda Kernel gpu_nonlop.cu

 Copyright (C) 2011-2024 ABINIT group (FDahm, MT)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt.

INPUTS

  atindx1(natom)=index table for atoms, inverse of atindx
  choice: chooses possible output:
    choice=0 => do nothing (only compute WF projected with NL projectors)
          =1 => a non-local energy contribution
          =2 => a gradient with respect to atomic position(s)
          =3 => a gradient with respect to strain(s)
          =23=> a gradient with respect to atm. pos. and strain(s)
  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
  dimenl1,dimenl2=dimensions of enl (see enl)
  dimffnlin=second dimension of ffnlin (1+number of derivatives)
  dimffnlout=second dimension of ffnlout (1+number of derivatives)
  enl(dimenl1,dimenl2,nspinortot**2)=
  ->Norm conserving : ==== when paw_opt=0 ====
                      (Real) Kleinman-Bylander energies (hartree)
                      dimenl1=lmnmax  -  dimenl2=ntypat
  ->PAW :             ==== when paw_opt=1, 2 or 4 ====
                      (Real or complex, hermitian) Dij coefs to connect projectors
                      dimenl1=cplex_enl*lmnmax*(lmnmax+1)/2  -  dimenl2=natom
  ffnlin(npwin,dimffnlin,lmnmax,ntypat)=nonlocal form factors to be used
          for the application of the nonlocal operator to the |in> vector
  ffnlout(npwout,dimffnlout,lmnmax,ntypat)=nonlocal form factors to be used
          for the application of the nonlocal operator to the |out> vector
  gprimd(3,3)=dimensional reciprocal space primitive translations
  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
                        - k point direction in the case (choice=5,signs=2)
                          for choice 53, twisted derivative involves idir+1 and idir+2 (mod 3)
                        - strain component (1:6) in the case (choice=3,signs=2) or (choice=6,signs=1)
  indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=lmn
  istwf_k=option parameter that describes the storage of wfs
  kgin(3,npwin)=integer coords of planewaves in basis sphere, for the |in> vector
  kgout(3,npwout)=integer coords of planewaves in basis sphere, for the |out> vector
  kpgin(npw,npkgin)= (k+G) components and related data, for the |in> vector
  kpgout(npw,nkpgout)=(k+G) components and related data, for the |out> vector
  kptin(3)=k point in terms of recip. translations, for the |in> vector
  kptout(3)=k point in terms of recip. translations, for the |out> vector
  lambda=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
         Typically lambda is the eigenvalue (or its guess)
  lmnmax=max. number of (l,m,n) components over all types of atoms
  matblk=dimension of the arrays ph3din and ph3dout
  mgfft=maximum size of 1D FFTs
  mpi_enreg=information about MPI parallelization
  natom=number of atoms in cell
  nattyp(ntypat)=number of atoms of each type
  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
  nkpgin,nkpgout=second sizes of arrays kpgin/kpgout
  nloalg(3)=governs the choice of the algorithm for nonlocal operator
  nnlout=dimension of enlout (when signs=1 and choice>0):
         ==== if paw_opt=0, 1 or 2 ====
         choice=1=>nnlout=1   choice=2=>nnlout=3*natom    choice=3=>nnlout=6
         ==== if paw_opt=3 ====
         choice=1 =>nnlout=1
         ==== if paw_opt=4 ====
         not available
  npwin=number of planewaves for given k point, for the |in> vector
  npwout=number of planewaves for given k point, for the |out> vector
  nspinor=number of spinorial components of the wavefunctions (on current proc)
  nspinortot=number of spinorial components of the wavefunctions on current proc
  ntypat=number of types of atoms in cell
  paw_opt= define the nonlocal operator concerned with:
           paw_opt=0 : Norm-conserving Vnl (use of Kleinman-Bylander ener.)
           paw_opt=1 : PAW nonlocal part of H (use of Dij coeffs)
           paw_opt=2 : PAW: (Vnl-lambda.Sij) (Sij=overlap matrix)
           paw_opt=3 : PAW overlap matrix (Sij)
           paw_opt=4 : both PAW nonlocal part of H (Dij) and overlap matrix (Sij)
  phkxredin(2,natom)=phase factors exp(2 pi kptin.xred)
  phkxredout(2,natom)=phase factors exp(2 pi kptout.xred)
  ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
  ph3din(2,npwin,matblk)=3D structure factors, for each atom and plane wave (in)
  ph3dout(2,npwout,matblk)=3-dim structure factors, for each atom and plane wave (out)
  signs= if 1, get contracted elements (energy, forces, stress, ...)
         if 2, applies the non-local operator to a function in reciprocal space
  sij(dimenl1,ntypat*(paw_opt/3))=overlap matrix components (only if paw_opt=2, 3 or 4)
  ucvol=unit cell volume (bohr^3)
  vectin(2,npwin*nspinor)=input cmplx wavefunction coefficients <G|Cnk>
  [cprjin_left(natom,nspinor)]=The projected input wave function <p_nlm|in_left>
    for the left wavefunction. Data are assumed to be in memory, they are NOT recalculated here.
    Only signs==1 and choice==1 are supported.

OUTPUT

 ==== if (signs==1) ====
 --If (paw_opt==0, 1 or 2)
    enlout(nnlout)= contribution to the non-local part of the following properties:
      if choice=1 : enlout(1)               -> the energy
      if choice=2 : enlout(1:3*natom)       -> the forces
      if choice=3 : enlout(1:6)             -> the stresses
      if choice=23: enlout(1:6+3*natom)     -> the forces and the stresses
 --If (paw_opt==3)
    if choice=1 : enlout(nnlout)= contribution to <c|S|c>  (nnlout=1)
 --If (paw_opt==4)
    not available
 ==== if (signs==2) ====
 --if (paw_opt=0, 1 or 4)
    vectout(2,npwout*nspinor)=result of the aplication of the concerned operator
                or one of its derivatives to the input vect.:
      if (choice=1) <G|V_nonlocal|vect_start>
      if (choice=2) <G|dV_nonlocal/d(atm coord)|vect_start>
      if (choice=3) <G|dV_nonlocal/d(strain)|vect_start>
  if (paw_opt=2)
    vectout(2,npwout*nspinor)=final vector in reciprocal space:
      if (choice=1) <G|V_nonlocal-lamdba.(I+S)|vect_start>
      if (choice=2) <G|d[V_nonlocal-lamdba.(I+S)]/d(atm coord)|vect_start>
      if (choice=3) <G|d[V_nonlocal-lamdba.(I+S)]/d(strain)|vect_start>
 --if (paw_opt=3 or 4)
    svectout(2,npwout*nspinor)=result of the aplication of Sij (overlap matrix)
                  or one of its derivatives to the input vect.:
      if (choice=1) <G|I+S|vect_start>
      if (choice=2) <G|dS/d(atm coord)|vect_start>
      if (choice=3) <G|dS/d(strain)|vect_start>

SIDE EFFECTS

  cprjin(natom,nspinor) <type(pawcprj_type)>=projected input wave function |in> on non-local projectors
                                  =<p_lmn|in> and derivatives
                    Treatment depends on cpopt parameter:
                     if cpopt=-1, <p_lmn|in> (and derivatives)
                                  are computed here (and not saved)
                     if cpopt= 0, <p_lmn|in> are computed here and saved
                                  derivatives are eventually computed but not saved
                     if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
                                  other derivatives are eventually computed but not saved

TODO

 * Implementation for spinorial wave functions (nspinor=2)
 * Implementation for response function (phonons, ddk, elastic tensor, ...)

SOURCE

1028  subroutine nonlop_gpu(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimffnlin,dimffnlout,&
1029 &                      enl,enlout,ffnlin,ffnlout,gprimd,idir,indlmn,istwf_k,&
1030 &                      kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,&
1031 &                      mpi_enreg,natom,nattyp,ngfft,nkpgin,nkpgout,nloalg,nnlout,&
1032 &                      npwin,npwout,nspinor,nspinortot,ntypat,paw_opt,phkxredin,phkxredout,ph1d,&
1033 &                      ph3din,ph3dout,signs,sij,svectout,ucvol,vectin,vectout)
1034 
1035 !Arguments ------------------------------------
1036 !scalars
1037  integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimffnlin,dimffnlout,idir
1038  integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,natom,nkpgin,nkpgout,nnlout
1039  integer,intent(in) :: npwin,npwout,nspinor,nspinortot,ntypat,paw_opt,signs
1040  real(dp),intent(in) :: lambda,ucvol
1041  type(MPI_type),intent(in) :: mpi_enreg
1042 !arrays
1043  integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin)
1044  integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3)
1045  real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2)
1046  real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat)
1047  real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat) !,gmet(3,3)
1048  real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin),kpgout(npwout,nkpgout)
1049  real(dp),intent(in) :: kptin(3),kptout(3),ph1d(2,3*(2*mgfft+1)*natom)
1050  real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom)
1051  real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3))
1052  real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk)
1053  real(dp),intent(inout) :: vectin(:,:)
1054  real(dp),intent(out) :: enlout(:)
1055  real(dp),intent(out),target :: svectout(:,:)
1056  real(dp),intent(out),target :: vectout (:,:)
1057  type(pawcprj_type),intent(inout) :: cprjin(:,:)
1058 
1059 !Local variables-------------------------------
1060 !scalars
1061  integer :: ia,iatom,ilmn,iproj,ispinor,itypat,signs_
1062  real(dp) :: doti
1063  character(len=500) :: msg
1064 !arrays
1065  real(dp),allocatable :: proj(:,:)
1066  real(dp),pointer :: svectout_(:,:),vectout_(:,:)
1067 
1068 ! **********************************************************************
1069 
1070  DBG_ENTER("COLL")
1071 
1072 !Error on bad choice
1073  if ((choice<0 .or. (choice>3.and.choice/=7)).and. choice/=23 .and. choice/=24) then
1074    write(msg,'(a,i0,a)')'Does not presently support this choice=',choice,'.'
1075    ABI_BUG(msg)
1076  end if
1077  if (cpopt<-1.or.cpopt>2) then
1078    msg='  Bad value for cpopt !'
1079    ABI_BUG(msg)
1080  end if
1081  if (nspinor==2) then
1082    msg='  nspinor=2 (spinorial WF) not yet allowed !'
1083    ABI_ERROR(msg)
1084  end if
1085 
1086  if ((cpopt==0).or.(cpopt==1).or.(cpopt==2))  then
1087    ABI_MALLOC(proj,(2,lmnmax*natom))
1088    proj=zero;
1089  end if
1090 
1091 !Workaround to get choice=1/signs=1 working
1092  if (choice==1.and.signs==1) then
1093    signs_=2
1094    ABI_MALLOC(vectout_,(2,npwin*nspinor))
1095    ABI_MALLOC(svectout_,(2,npwin*nspinor*(paw_opt/3)))
1096  else
1097    signs_=signs;vectout_=>vectout;svectout_=>svectout
1098  end if
1099 
1100 !if cpot==2, the projections are already in memory
1101  if (cpopt>=2) then
1102    iproj=0
1103    do ispinor=1,nspinor
1104      iatom=0
1105      do itypat=1,ntypat
1106        do ia=1,nattyp(itypat)
1107          iatom=iatom+1
1108          do ilmn=1,cprjin(iatom,1)%nlmn
1109            iproj=iproj+1
1110            proj(:,iproj)=cprjin(iatom,1)%cp(:,ilmn)
1111          end do
1112        end do
1113      end do
1114    end do
1115  end if
1116 
1117 #if defined HAVE_GPU_CUDA
1118  call gpu_nonlop(atindx1,choice,cpopt,proj,dimenl1,dimenl2,dimffnlin,dimffnlout,&
1119 & enl,enlout,ffnlin,ffnlout,gprimd,idir,indlmn,istwf_k,&
1120 & kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,&
1121 & mpi_enreg%me_g0_fft,natom,nattyp,ngfft,nkpgin,nkpgout,nloalg,nnlout,&
1122 & npwin,npwout,nspinor,ntypat,paw_opt,phkxredin,phkxredout,ph1d,&
1123 & ph3din,ph3dout,signs_,sij,svectout_,pi,ucvol,vectin,vectout_)
1124 #else
1125  ABI_UNUSED(nnlout)
1126 #endif
1127 
1128  if (choice==1.and.signs==1) then
1129    if (paw_opt/=3) then
1130      call dotprod_g(enlout(1),doti,istwf_k,npwin*nspinor,1,vectin,vectout_,mpi_enreg%me_g0_fft,mpi_enreg%comm_spinorfft)
1131    else
1132      call dotprod_g(enlout(1),doti,istwf_k,npwin*nspinor,1,vectin,svectout_,mpi_enreg%me_g0_fft,mpi_enreg%comm_spinorfft)
1133    end if
1134    ABI_FREE(vectout_)
1135    ABI_FREE(svectout_)
1136  else
1137    nullify(vectout_,svectout_)
1138  end if
1139 
1140  if ((cpopt==0).or.(cpopt==1)) then
1141    iproj=0
1142    do ispinor=1,nspinor
1143      iatom=0
1144      do itypat=1,ntypat
1145        do ia=1,nattyp(itypat)
1146          iatom=iatom+1;cprjin(iatom,1)%nlmn=count(indlmn(3,:,itypat)>0)
1147          do ilmn=1,cprjin(iatom,1)%nlmn
1148            iproj=iproj+1;cprjin(iatom,1)%cp(:,ilmn)=proj(:,iproj)
1149          end do
1150        end do
1151      end do
1152    end do
1153  end if
1154 
1155  if (allocated(proj)) then
1156    ABI_FREE(proj)
1157  end if
1158 
1159  DBG_EXIT("COLL")
1160 
1161 !Fake statements to satisfy ABI rules
1162 #if ! defined HAVE_GPU_CUDA
1163  if (.false.) then
1164    write(std_out,*) atindx1,enl,ffnlin,ffnlout,gprimd
1165    write(std_out,*) idir,istwf_k,kgin,kgout,kpgin,kpgout
1166    write(std_out,*) kptin,kptout,lambda,mpi_enreg%me
1167    write(std_out,*) ngfft,nloalg,ph1d,ph3din,ph3dout
1168    write(std_out,*) phkxredin,phkxredout,signs,sij
1169    write(std_out,*) ucvol,vectin
1170  end if
1171 #endif
1172 
1173 end subroutine nonlop_gpu

TABLE OF CONTENTS

ABINIT/m_nonlop [ Modules ]

ABINIT/nonlop [ Functions ]

ABINIT/nonlop_gpu [ Functions ]