TABLE OF CONTENTS


ABINIT/m_gemm_nonlop [ Modules ]

[ Top ] [ Modules ]

NAME

 m_gemm_nonlop

FUNCTION

  This module provides functions to compute the nonlocal operator by means of the BLAS GEMM
  routine. By treating ndat simultaneous wavefunctions, it is able to exploit BLAS3 routines,
  which leads to excellent CPU efficiency and OpenMP scalability.

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

23 ! TODO list :
24 ! Don't allocate the full nkpt structures, only those that are treated by this proc: use same init as in m_bandfft_kpt
25 ! support more options (forces & stresses mostly)
26 ! Support RF/other computations (only GS right now)
27 ! handle the case where nloalg(2) < 0, ie no precomputation of ph3d
28 ! more systematic checking of the workflow (right now, only works if init/make/gemm/destroy, no multiple makes, etc)
29 ! Avoid allocating the complex matrix when istwfk > 1
30 ! Merge with chebfi's invovl
31 
32 
33 #if defined HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36 
37 #include "abi_common.h"
38 
39 ! Commented for now because it causes bugs on Shiva's OpenBLAS (?!)
40 ! #if defined HAVE_LINALG_GEMM3M
41 ! #define ZGEMM ZGEMM3M
42 ! #endif
43 
44 module m_gemm_nonlop
45 
46  use defs_basis
47  use defs_abitypes
48  use m_errors
49  use m_abicore
50  use m_xmpi
51 
52  use m_opernlc_ylm,    only :  opernlc_ylm
53 
54  implicit none
55 
56  private
57 
58  ! Use these routines in order : first call init, then call make_gemm_nonlop for each k point,
59  ! then call gemm_nonlop to do the actual computation, and call destroy when done. See gstate and vtorho.
60  public :: init_gemm_nonlop
61  public :: make_gemm_nonlop
62  public :: gemm_nonlop
63  public :: destroy_gemm_nonlop

m_gemm_nonlop/destroy_gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 destroy_gemm_nonlop

FUNCTION

 Initalization of the gemm_nonlop_kpt array

INPUTS

 nkpt= number of k-points

PARENTS

      gstate

CHILDREN

      dgemm,opernlc_ylm,xmpi_sum,zgemm

SOURCE

155  subroutine destroy_gemm_nonlop(nkpt)
156 
157 
158 !This section has been created automatically by the script Abilint (TD).
159 !Do not modify the following lines by hand.
160 #undef ABI_FUNC
161 #define ABI_FUNC 'destroy_gemm_nonlop'
162 !End of the abilint section
163 
164   implicit none
165 
166   integer,intent(in) :: nkpt
167   integer :: ikpt
168 
169 ! *************************************************************************
170 
171   ! TODO add cycling if kpt parallelism
172   do ikpt = 1,nkpt
173     if(gemm_nonlop_kpt(ikpt)%nprojs /= -1) then
174       ABI_DEALLOCATE(gemm_nonlop_kpt(ikpt)%projs)
175       ABI_DEALLOCATE(gemm_nonlop_kpt(ikpt)%projs_r)
176       ABI_DEALLOCATE(gemm_nonlop_kpt(ikpt)%projs_i)
177       gemm_nonlop_kpt(ikpt)%nprojs = -1
178     end if
179   end do
180 
181   ABI_DATATYPE_DEALLOCATE(gemm_nonlop_kpt)
182 
183  end subroutine destroy_gemm_nonlop

m_gemm_nonlop/gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 gemm_nonlop

FUNCTION

 Replacement of nonlop

 same prototype as nonlop

INPUTS

PARENTS

      nonlop

CHILDREN

      dgemm,opernlc_ylm,xmpi_sum,zgemm

SOURCE

341  subroutine gemm_nonlop(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,&
342 &                 enl,enlout,ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,&
343 &                 kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,&
344 &                 mpi_enreg,mpsang,mpssoang,natom,nattyp,ndat,ngfft,nkpgin,nkpgout,nloalg,&
345 &                 nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO,paw_opt,phkxredin,&
346 &                 phkxredout,ph1d,ph3din,ph3dout,signs,sij,svectout,&
347 &                 tim_nonlop,ucvol,useylm,vectin,vectout,&
348 &                 use_gpu_cuda)
349 
350   use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby
351   use m_time,    only : cwtime
352 
353 !This section has been created automatically by the script Abilint (TD).
354 !Do not modify the following lines by hand.
355 #undef ABI_FUNC
356 #define ABI_FUNC 'gemm_nonlop'
357 !End of the abilint section
358 
359   implicit none
360 
361   !Arguments ------------------------------------
362   !scalars
363   integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,idir
364   integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,mpsang,mpssoang,natom,ndat,nkpgin
365   integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO
366   integer,intent(in) :: paw_opt,signs,tim_nonlop,useylm
367   integer,optional,intent(in) :: use_gpu_cuda
368   real(dp),intent(in) :: lambda(ndat),ucvol
369   type(MPI_type),intent(in) :: mpi_enreg
370   !arrays
371   integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin)
372   integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3)
373   real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq)
374   real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat)
375   real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3)
376   real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin*useylm)
377   real(dp),intent(in) :: kpgout(npwout,nkpgout*useylm),kptin(3),kptout(3)
378   real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom),ph1d(2,*)
379   real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3))
380   real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk)
381   real(dp),intent(inout) :: vectin(2,npwin*nspinor*ndat)
382   real(dp),intent(inout) :: enlout(nnlout*ndat)
383   real(dp),intent(out) :: svectout(2,npwout*nspinor*(paw_opt/3)*ndat)
384   real(dp),intent(inout) :: vectout(2,npwout*nspinor*ndat) !vz_i
385   type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
386 
387   ! locals
388   integer :: idat, nprojs, shift, iatom, nlmn, ierr, ibeg, iend
389   integer :: cplex, cplex_enl, cplex_fac
390   integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn
391   integer :: cplex_dgxdt(1), cplex_d2gxdt(1)
392   real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1)
393   real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1)
394   real(dp), allocatable :: sij_typ(:)
395   real(dp), allocatable :: projections(:,:,:), s_projections(:,:,:), vnl_projections(:,:,:)
396   real(dp), allocatable :: temp_realvec(:)
397 
398 ! *************************************************************************
399 
400   ! We keep the same interface as nonlop, but we don't use many of those
401   ABI_UNUSED((/ffnlin,ffnlout,gmet,gprimd,kpgin,kpgout/))
402   ABI_UNUSED((/ph1d(1,1),ph3din,ph3dout/))
403   ABI_UNUSED((/phkxredin,phkxredout,ucvol/))
404   ABI_UNUSED((/mgfft,mpsang,mpssoang/))
405   ABI_UNUSED((/kptin,kptout/))
406   ABI_UNUSED((/idir,nloalg,ngfft,kgin,kgout,ngfft,only_SO,tim_nonlop,use_gpu_cuda,signs/))
407   ABI_UNUSED((/enlout/))
408 
409   cplex=2;if (istwf_k>1) cplex=1
410   cplex_enl=1;if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex?
411   cplex_fac=max(cplex,dimekbq)
412   if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex?
413 
414   nprojs = gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%nprojs
415 
416   ! These will store the non-local factors for vectin, svectout and vectout respectively
417   ABI_ALLOCATE(projections,(cplex, nprojs,nspinor*ndat))
418   ABI_ALLOCATE(s_projections,(cplex, nprojs,nspinor*ndat))
419   ABI_ALLOCATE(vnl_projections,(cplex_fac, nprojs,nspinor*ndat))
420   projections = zero
421   s_projections = zero
422   vnl_projections = zero
423 
424   if(nprojs == 0) then
425     ! TODO check if this is correct
426     vectout = zero
427     if(paw_opt>0) svectout = vectin
428     return
429   end if
430 
431   if(cpopt >= 2) then
432     ! retrieve from cprjin
433     do idat=1, ndat*nspinor
434       shift = 0
435       do iatom = 1, natom
436         nlmn = cprjin(iatom, idat)%nlmn
437         projections(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn)
438         shift = shift + nlmn
439       end do
440     end do
441   else
442     ! opernla
443     if(cplex == 2) then
444       call ZGEMM('C', 'N', nprojs, ndat, npwin*nspinor, cone, &
445 &                gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwin*nspinor,&
446 &                vectin, npwin*nspinor, czero, projections, nprojs)
447     else
448        ABI_ALLOCATE(temp_realvec,(MAX(npwout,npwin)*nspinor*ndat))
449       ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i
450       temp_realvec(1:npwin*nspinor*ndat) = vectin(1,1:npwin*nspinor*ndat)
451       if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then
452         do idat=1, ndat
453           temp_realvec(1+(idat-1)*npwin*nspinor) = temp_realvec(1+(idat-1)*npwin*nspinor)/2
454         end do
455       end if
456       call DGEMM('T', 'N', nprojs, ndat, npwin*nspinor, one, &
457 &                gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwin*nspinor, &
458 &                temp_realvec, npwin*nspinor, zero, projections, nprojs)
459       temp_realvec(1:npwin*nspinor*ndat) = vectin(2,1:npwin*nspinor*ndat)
460       if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then
461         do idat=1, ndat
462           temp_realvec(1+(idat-1)*npwin*nspinor) = zero
463         end do
464       end if
465       call DGEMM('T', 'N', nprojs, ndat, npwin*nspinor, one, &
466 &                gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwin*nspinor, &
467 &                temp_realvec, npwin*nspinor, one , projections, nprojs)
468       projections = projections * 2
469        ABI_DEALLOCATE(temp_realvec)
470     end if
471     call xmpi_sum(projections,mpi_enreg%comm_fft,ierr)
472 
473     if(cpopt >= 0) then
474       ! store in cprjin
475       do idat=1, ndat*nspinor
476         shift = 0
477         do iatom = 1, natom
478           nlmn = cprjin(iatom, idat)%nlmn
479           cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections(1:cplex, shift+1:shift+nlmn, idat)
480           shift = shift + nlmn
481         end do
482       end do
483     end if
484   end if
485 
486   if(choice > 0) then
487 
488     if(choice /= 7) then
489       ! opernlc
490       iatm = 0
491       ndgxdt = 0
492       ndgxdtfac = 0
493       nd2gxdt = 0
494       nd2gxdtfac = 0
495       optder = 0
496 
497       shift = 0
498       ABI_ALLOCATE(sij_typ,(((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2))
499       do itypat=1, ntypat
500         nlmn=count(indlmn(3,:,itypat)>0)
501         if (paw_opt>=2) then
502           if (cplex_enl==1) then
503             do ilmn=1,nlmn*(nlmn+1)/2
504               sij_typ(ilmn)=sij(ilmn,itypat)
505             end do
506           else
507             do ilmn=1,nlmn*(nlmn+1)/2
508               sij_typ(ilmn)=sij(2*ilmn-1,itypat)
509             end do
510           end if
511         end if
512 
513         ibeg = shift+1
514         iend = shift+nattyp(itypat)*nlmn
515 
516         do idat = 1,ndat
517           call opernlc_ylm(atindx1,cplex,cplex_dgxdt,cplex_d2gxdt,cplex_enl,cplex_fac,dgxdt_dum_in,dgxdt_dum_out,dgxdt_dum_out2,&
518 &         d2gxdt_dum_in,d2gxdt_dum_out,d2gxdt_dum_out2,dimenl1,dimenl2,dimekbq,enl,projections(:, ibeg:iend, idat),&
519 &         vnl_projections(:, ibeg:iend, idat),s_projections(:, ibeg:iend, idat),&
520 &         iatm,indlmn(:,:,itypat),itypat,lambda(idat),mpi_enreg,natom,ndgxdt,ndgxdtfac,nd2gxdt,nd2gxdtfac,&
521 &         nattyp(itypat),nlmn,nspinor,nspinortot,optder,paw_opt,sij_typ)
522         end do
523 
524         shift = shift + nattyp(itypat)*nlmn
525         iatm = iatm+nattyp(itypat)
526       end do
527       ABI_DEALLOCATE(sij_typ)
528     else
529       s_projections = projections
530     end if
531 
532     ! opernlb
533     if(paw_opt == 3 .or. paw_opt == 4) then
534       ! Get svectout from s_projections
535       if(cplex == 2) then
536         call ZGEMM('N', 'N', npwout*nspinor, ndat, nprojs, cone, &
537 &                  gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwout*nspinor, &
538 &                  s_projections, nprojs, czero, svectout, npwout*nspinor)
539       else
540          ABI_ALLOCATE(temp_realvec,(MAX(npwout,npwin)*nspinor*ndat))
541         call DGEMM('N', 'N', npwout*nspinor, ndat, nprojs, one, &
542 &                  gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwout*nspinor, &
543 &                  s_projections, nprojs, zero, temp_realvec, npwout*nspinor)
544         svectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
545         call DGEMM('N', 'N', npwout*nspinor, ndat, nprojs, one, &
546 &                  gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwout*nspinor,&
547 &                  s_projections, nprojs, zero, temp_realvec, npwout*nspinor)
548         svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
549          ABI_DEALLOCATE(temp_realvec)
550       end if
551       if(choice /= 7) svectout = svectout + vectin ! TODO understand this
552     end if
553     if(paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then
554       ! Get vectout from vnl_projections
555       if(cplex_fac == 2) then
556         call ZGEMM('N', 'N', npwout*nspinor, ndat, nprojs, cone, &
557 &                 gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwout*nspinor, &
558 &                 vnl_projections, nprojs, czero, vectout, npwout*nspinor)
559       else
560          ABI_ALLOCATE(temp_realvec,(MAX(npwout,npwin)*nspinor*ndat))
561         call DGEMM('N', 'N', npwout*nspinor, ndat, nprojs, one, &
562 &                  gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwout*nspinor, &
563 &                  vnl_projections, nprojs, zero, temp_realvec, npwout*nspinor)
564         vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
565         call DGEMM('N', 'N', npwout*nspinor, ndat, nprojs, one, &
566 &                  gemm_nonlop_kpt(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwout*nspinor, &
567 &                  vnl_projections, nprojs, zero, temp_realvec, npwout*nspinor)
568         vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
569          ABI_DEALLOCATE(temp_realvec)
570       end if
571     end if
572   end if
573 
574   ABI_DEALLOCATE(projections)
575   ABI_DEALLOCATE(s_projections)
576   ABI_DEALLOCATE(vnl_projections)
577  end subroutine gemm_nonlop
578 !***
579 end module m_gemm_nonlop

m_gemm_nonlop/gemm_nonlop_type [ Types ]

[ Top ] [ m_gemm_nonlop ] [ Types ]

NAME

 gemm_nonlop_type

FUNCTION

 Contains information needed to apply the nonlocal operator

SOURCE

74  type,public :: gemm_nonlop_type
75 
76    integer :: nprojs
77 
78    real(dp), allocatable :: projs(:, :, :)
79    ! (2, npw, nprojs)
80    real(dp), allocatable :: projs_r(:, :, :)
81    ! (1, npw, nprojs)
82    real(dp), allocatable :: projs_i(:, :, :)
83    ! (1, npw, nprojs)
84  end type gemm_nonlop_type

m_gemm_nonlop/init_gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 init_gemm_nonlop

FUNCTION

 Initalization of the gemm_nonlop_kpt array

INPUTS

 nkpt= number of k-points

PARENTS

      gstate

CHILDREN

      dgemm,opernlc_ylm,xmpi_sum,zgemm

SOURCE

115  subroutine init_gemm_nonlop(nkpt)
116 
117 
118 !This section has been created automatically by the script Abilint (TD).
119 !Do not modify the following lines by hand.
120 #undef ABI_FUNC
121 #define ABI_FUNC 'init_gemm_nonlop'
122 !End of the abilint section
123 
124   integer,intent(in) :: nkpt
125   integer :: ikpt
126 
127 ! *************************************************************************
128 
129   ! TODO only allocate the number of kpt treated by this proc
130   ABI_DATATYPE_ALLOCATE(gemm_nonlop_kpt, (nkpt))
131   do ikpt=1,nkpt
132     gemm_nonlop_kpt(ikpt)%nprojs = -1
133   end do
134 
135  end subroutine init_gemm_nonlop

m_gemm_nonlop/make_gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 make_gemm_nonlop

FUNCTION

 Build the gemm_nonlop array

INPUTS

PARENTS

      energy,vtorho

CHILDREN

      dgemm,opernlc_ylm,xmpi_sum,zgemm

SOURCE

203  subroutine make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k,ph3d_k)
204 
205   use m_abi_linalg
206 
207 !This section has been created automatically by the script Abilint (TD).
208 !Do not modify the following lines by hand.
209 #undef ABI_FUNC
210 #define ABI_FUNC 'make_gemm_nonlop'
211 !End of the abilint section
212 
213   implicit none
214 
215   integer, intent(in) :: ikpt
216   integer, intent(in) :: npw, lmnmax,ntypat
217   integer, intent(in) :: indlmn(:,:,:)
218   integer, intent(in) :: nattyp(ntypat)
219   integer, intent(in) :: istwf_k
220   real(dp), intent(in) :: ucvol
221   real(dp), intent(in) :: ffnl_k(:,:,:,:)
222   real(dp), intent(in) :: ph3d_k(:,:,:)
223 
224   integer :: nprojs
225 
226   real(dp) :: atom_projs(2, npw, lmnmax)
227   real(dp) :: temp(npw)
228 
229   integer :: itypat, ilmn, nlmn, ia, iaph3d, shift
230   integer :: il, ipw
231   logical :: parity
232 
233 ! *************************************************************************
234 
235   iaph3d = 1
236 
237   if(gemm_nonlop_kpt(ikpt)%nprojs /= -1) then
238     ! We have been here before, cleanup before remaking
239     ABI_DEALLOCATE(gemm_nonlop_kpt(ikpt)%projs)
240     ABI_DEALLOCATE(gemm_nonlop_kpt(ikpt)%projs_r)
241     ABI_DEALLOCATE(gemm_nonlop_kpt(ikpt)%projs_i)
242     gemm_nonlop_kpt(ikpt)%nprojs = -1
243   end if
244 
245   ! build nprojs
246   nprojs = 0
247   do itypat=1,ntypat
248     nprojs = nprojs + count(indlmn(3,:,itypat)>0)*nattyp(itypat)
249   end do
250 
251   gemm_nonlop_kpt(ikpt)%nprojs = nprojs
252 
253   ABI_ALLOCATE(gemm_nonlop_kpt(ikpt)%projs, (2, npw, gemm_nonlop_kpt(ikpt)%nprojs))
254   gemm_nonlop_kpt(ikpt)%projs = zero
255   if(istwf_k > 1) then
256     ! We still allocate the complex matrix, in case we need it for spinors. TODO could be avoided
257     ABI_ALLOCATE(gemm_nonlop_kpt(ikpt)%projs_r, (1, npw, gemm_nonlop_kpt(ikpt)%nprojs))
258     ABI_ALLOCATE(gemm_nonlop_kpt(ikpt)%projs_i, (1, npw, gemm_nonlop_kpt(ikpt)%nprojs))
259     gemm_nonlop_kpt(ikpt)%projs_r = zero
260     gemm_nonlop_kpt(ikpt)%projs_i = zero
261   else
262     ! Still allocate so we can deallocate it in destroy_gemm_nonlop
263     ABI_ALLOCATE(gemm_nonlop_kpt(ikpt)%projs_r, (1, 1, 1))
264     ABI_ALLOCATE(gemm_nonlop_kpt(ikpt)%projs_i, (1, 1, 1))
265   end if
266 
267   shift = 0
268   do itypat = 1, ntypat
269     nlmn = count(indlmn(3,:,itypat)>0)
270 
271     do ia = 1, nattyp(itypat)
272 
273       !! build atom_projs, from opernlb
274       !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l
275       atom_projs(:,:,:) = 0
276 
277       ! start from 4pi/sqrt(ucvol)*ffnl
278       ! atom_projs(1, :, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ham%ffnl_k(:, 1, 1:nlmn)
279       ! TODO vectorize (DCOPY with stride)
280       do ipw=1, npw
281         atom_projs(1,ipw, 1:nlmn) = four_pi/sqrt(ucvol) * ffnl_k(ipw, 1, 1:nlmn, itypat)
282       end do
283 
284       ! multiply by (-i)^l
285       do ilmn=1,nlmn
286         il=mod(indlmn(1,ilmn, itypat),4);
287         parity=(mod(il,2)==0)
288         if (il>1) then
289           ! multiply by -1
290           atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn)
291         end if
292         if(.not. parity) then
293           ! multiply by -i
294           temp = atom_projs(2,:,ilmn)
295           atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn)
296           atom_projs(1,:,ilmn) =  temp
297         end if
298       end do
299 
300       ! multiply by conj(ph3d)
301       do ilmn=1,nlmn
302         temp = atom_projs(1, :, ilmn)
303         atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d_k(1, :, iaph3d) &
304 &                              + atom_projs(2, :, ilmn) * ph3d_k(2, :, iaph3d)
305         atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d_k(1, :, iaph3d) &
306 &                              - temp                   * ph3d_k(2, :, iaph3d)
307       end do
308 
309       !! atom_projs is built, copy to projs
310       gemm_nonlop_kpt(ikpt)%projs(:, :, shift+1:shift+nlmn) = atom_projs(:, :, 1:nlmn)
311       if(istwf_k > 1) then
312         gemm_nonlop_kpt(ikpt)%projs_r(1, :, shift+1:shift+nlmn) = atom_projs(1, :, 1:nlmn)
313         gemm_nonlop_kpt(ikpt)%projs_i(1, :, shift+1:shift+nlmn) = atom_projs(2, :, 1:nlmn)
314       end if
315       shift = shift + nlmn
316       iaph3d = iaph3d + 1
317     end do
318   end do
319 
320  end subroutine make_gemm_nonlop