TABLE OF CONTENTS
- ABINIT/m_gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/free_gemm_nonlop_ompgpu_ikpt
- m_gemm_nonlop_ompgpu/destroy_gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernla
- m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernlb
- m_gemm_nonlop_ompgpu/init_gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/make_gemm_nonlop_ompgpu
ABINIT/m_gemm_nonlop_ompgpu [ Modules ]
NAME
m_gemm_nonlop_ompgpu
FUNCTION
This module provides functions to compute the nonlocal operator by means of the BLAS GEMM routine, offloaded to GPUs through OpenMP target directives. By treating ndat wavefunctions simultaneously, it is able to exploit BLAS3 routines, which leads to excellent efficiency and scalability.
COPYRIGHT
Copyright (C) 2014-2024 ABINIT group (MS) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
18 ! TODO list : 19 ! Don't allocate the full nkpt structures, only those that are treated by this proc: use same init as in m_bandfft_kpt 20 ! support more options (forces & stresses mostly) 21 ! Support RF/other computations (only GS right now) 22 ! handle the case where nloalg(2) < 0, ie no precomputation of ph3d 23 ! more systematic checking of the workflow (right now, only works if init/make/gemm/destroy, no multiple makes, etc) 24 ! Avoid allocating the complex matrix when istwfk > 1 25 ! Merge with chebfi's invovl 26 27 28 #if defined HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include "abi_common.h" 33 34 module m_gemm_nonlop_ompgpu 35 36 use defs_basis 37 use m_errors 38 use m_abicore 39 use m_xmpi 40 use m_xomp 41 use m_abi_linalg 42 43 #ifdef HAVE_FC_ISO_C_BINDING 44 use iso_c_binding 45 #endif 46 47 #ifdef HAVE_GPU 48 use m_gpu_toolbox 49 #endif 50 51 use defs_abitypes, only : MPI_type 52 use m_opernlc_ylm_ompgpu, only : opernlc_ylm_ompgpu 53 use m_pawcprj, only : pawcprj_type 54 use m_geometry, only : strconv 55 use m_kg, only : mkkpg 56 use m_gemm_nonlop 57 58 #if defined HAVE_MPI2 59 use mpi 60 #endif 61 62 implicit none 63 64 private 65 66 ! Use these routines in order: first call init, then call make_gemm_nonlop for each k point, 67 ! then call gemm_nonlop to do the actual computation, and call destroy when done. See gstate and vtorho. 68 public :: init_gemm_nonlop_ompgpu 69 public :: destroy_gemm_nonlop_ompgpu 70 public :: make_gemm_nonlop_ompgpu 71 public :: gemm_nonlop_ompgpu 72 73 ! Those routines are here to assess memory requirements 74 public :: gemm_nonlop_ompgpu_work_mem 75 public :: gemm_nonlop_ompgpu_static_mem
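To illustrate the central idea independently of the module's own data structures: instead of forming each projection <p_i|psi_n> with separate dot products or ndat matrix-vector calls, the ndat*nspinor wavefunctions are gathered as columns of a single matrix so that one GEMM produces all projections at once. The sketch below is a minimal, self-contained illustration of that batching with a plain BLAS ZGEMM; the names projs, vects and projections are placeholders, not the module's actual arrays.

program gemm_batching_sketch
  ! projections(i,n) = sum_g conjg(projs(g,i)) * vects(g,n), for all i and n in one ZGEMM
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  integer, parameter :: npw = 512, nprojs = 36, ndat = 8
  complex(dp), parameter :: cone = (1.0_dp, 0.0_dp), czero = (0.0_dp, 0.0_dp)
  complex(dp) :: projs(npw, nprojs)        ! projectors <g|p_i>
  complex(dp) :: vects(npw, ndat)          ! ndat wavefunctions treated simultaneously
  complex(dp) :: projections(nprojs, ndat) ! <p_i|psi_n>
  real(dp) :: rea(npw, nprojs), ima(npw, nprojs), reb(npw, ndat), imb(npw, ndat)

  call random_number(rea); call random_number(ima); projs = cmplx(rea, ima, kind=dp)
  call random_number(reb); call random_number(imb); vects = cmplx(reb, imb, kind=dp)

  ! One BLAS3 call replaces nprojs*ndat dot products (or ndat ZGEMV calls)
  call zgemm('C', 'N', nprojs, ndat, npw, cone, projs, npw, vects, npw, czero, projections, nprojs)

  write(*,*) 'sample projection:', projections(1,1)
end program gemm_batching_sketch

When istwf_k > 1 the module follows the same pattern with the real and imaginary parts stored separately (projs_r/projs_i) and DGEMM instead of ZGEMM, which is why both variants appear in the SOURCE blocks below.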
m_gemm_nonlop_ompgpu/free_gemm_nonlop_ompgpu_ikpt [ Functions ]
NAME
free_gemm_nonlop_ompgpu_ikpt
FUNCTION
Release memory for one k-point entry of the gemm_nonlop_kpt array
INPUTS
ikpt= index of the gemm_nonlop_kpt entry to be released
SOURCE
467 subroutine free_gemm_nonlop_ompgpu_ikpt(ikpt) 468 469 integer,intent(in) :: ikpt 470 471 ! ************************************************************************* 472 473 if(gemm_nonlop_kpt(ikpt)%nprojs /= -1) then 474 if (allocated(gemm_nonlop_kpt(ikpt)%projs)) then 475 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs) 476 end if 477 if (allocated(gemm_nonlop_kpt(ikpt)%projs_r)) then 478 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_r) 479 end if 480 if (allocated(gemm_nonlop_kpt(ikpt)%projs_i)) then 481 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_i) 482 end if 483 gemm_nonlop_kpt(ikpt)%nprojs = -1 484 if(gemm_nonlop_kpt(ikpt)%ngrads /= -1) then 485 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs)) then 486 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs) 487 end if 488 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_r)) then 489 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_r) 490 end if 491 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_i)) then 492 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_i) 493 end if 494 gemm_nonlop_kpt(ikpt)%ngrads = -1 495 end if 496 end if 497 498 end subroutine free_gemm_nonlop_ompgpu_ikpt
m_gemm_nonlop_ompgpu/destroy_gemm_nonlop_ompgpu [ Functions ]
NAME
destroy_gemm_nonlop_ompgpu
FUNCTION
Destruction of the gemm_nonlop_kpt array
INPUTS
None. The routine releases all k-point entries, using the number of k-points (mod__nkpt) stored by init_gemm_nonlop_ompgpu.
SOURCE
429 subroutine destroy_gemm_nonlop_ompgpu() 430 431 integer :: ikpt 432 character(len=200) :: msg 433 434 ! ************************************************************************* 435 436 if(mod__nkpt==0) then 437 msg="GPU nonlop arrays aren't initialized. Redundant call" 438 ABI_ERROR(msg) 439 end if 440 441 call free_work_buffers() 442 call free_gemm_nonlop_kpt_ompgpu() 443 ! TODO add cycling if kpt parallelism 444 do ikpt = 1,mod__nkpt 445 call free_gemm_nonlop_ompgpu_ikpt(ikpt) 446 end do 447 448 ABI_FREE(gemm_nonlop_kpt) 449 mod__nkpt=0 450 451 end subroutine destroy_gemm_nonlop_ompgpu
m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu [ Functions ]
NAME
gemm_nonlop_ompgpu
FUNCTION
Replacement of nonlop. Same prototype as nonlop, although not all options are implemented.
INPUTS
[gpu_option] = GPU implementation to use, e.g. CUDA, OpenMP offload, ... (0=not using GPU)
SOURCE
768 subroutine gemm_nonlop_ompgpu(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,& 769 & enl,enlout,ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,& 770 & kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,& 771 & mpi_enreg,mpsang,mpssoang,natom,nattyp,ndat,ngfft,nkpgin,nkpgout,nloalg,& 772 & nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO,paw_opt,phkxredin,& 773 & phkxredout,ph1d,ph3din,ph3dout,signs,sij,svectout,& 774 & tim_nonlop,ucvol,useylm,vectin,vectout,& 775 & vectproj,gpu_option) 776 777 !Arguments ------------------------------------ 778 !scalars 779 integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,idir 780 integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,mpsang,mpssoang,natom,ndat,nkpgin 781 integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO 782 integer,intent(in) :: paw_opt,signs,tim_nonlop,useylm 783 integer,optional,intent(in) :: gpu_option 784 real(dp),intent(in) :: lambda(ndat),ucvol 785 type(MPI_type),intent(in) :: mpi_enreg 786 !arrays 787 integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin) 788 integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3) 789 real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq) 790 real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat) 791 real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3) 792 real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin*useylm) 793 real(dp),intent(in) :: kpgout(npwout,nkpgout*useylm),kptin(3),kptout(3) 794 real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom),ph1d(2,*) 795 real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3)) 796 real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk) 797 real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat) 798 real(dp),intent(inout) :: enlout(nnlout*ndat) 799 real(dp),intent(out),target :: svectout(:,:) 800 real(dp),intent(inout),target :: vectout(:,:) 801 real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:) 802 type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat) 803 804 805 ! locals 806 integer :: ii, ia, idat, igrad, nprojs, ngrads, shift, iatom, nlmn, ierr, ibeg, iend, i1, i2, i, ikpt 807 integer :: cplex, cplex_enl, cplex_fac, proj_shift, grad_shift, nattyp_i 808 integer :: enlout_shift, force_shift, nnlout_test 809 integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn 810 integer :: cplex_dgxdt(1), cplex_d2gxdt(1) 811 real(dp) :: esum,esumk(9) 812 real(dp) :: work(6) 813 real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1) 814 real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1) 815 real(dp), allocatable :: enlk(:) 816 real(dp),pointer :: projections_ptr(:,:,:) 817 integer :: nprojs_my_blk, ipw, iproj, iblock, nprojs_blk 818 integer :: rank, nprocs, req(2) 819 logical :: is_last 820 real(dp), allocatable :: projs_recv(:,:,:), dprojs_recv(:,:,:) 821 logical :: local_vectproj 822 logical :: transfer_vectin,transfer_vectout,transfer_svectout 823 character(len=500) :: msg 824 integer(C_SIZE_T) :: byte_count 825 real(dp), ABI_CONTIGUOUS pointer :: vectin_(:,:),vectout_(:,:),svectout_(:,:) 826 827 ! ************************************************************************* 828 829 ! 
We keep the same interface as nonlop, but we don't use many of those 830 ABI_UNUSED((/ffnlin,ffnlout,gmet,kpgin,kpgout/)) 831 ABI_UNUSED((/ph1d(1,1),ph3din,ph3dout/)) 832 ABI_UNUSED((/phkxredin,phkxredout,ucvol/)) 833 ABI_UNUSED((/mgfft,mpsang,mpssoang/)) 834 ABI_UNUSED((/kptin,kptout/)) 835 ABI_UNUSED((/idir,nloalg,ngfft,kgin,kgout,ngfft,only_SO,tim_nonlop,gpu_option/)) 836 837 ! Check supported options 838 if ( (choice>1.and.choice/=7.and.signs==2) .or. & 839 & (choice>3.and.choice/=7.and.choice/=23.and.signs==1) .or. & 840 & (useylm/=1) ) then 841 ABI_BUG('gemm_nonlop option not supported!') 842 end if 843 if (signs==1) then 844 nnlout_test=0 845 if (choice==1) nnlout_test=1 846 if (choice==2) nnlout_test=3*natom 847 if (choice==3) nnlout_test=6 848 if (choice==23) nnlout_test=6+3*natom 849 if (nnlout<nnlout_test) then 850 ABI_BUG('wrong nnlout size!') 851 end if 852 end if 853 854 ikpt=gemm_nonlop_ikpt_this_proc_being_treated 855 cplex=2;if (istwf_k>1) cplex=1 856 cplex_enl=1;if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex? 857 cplex_fac=max(cplex,dimekbq) 858 if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex? 859 860 nprojs = gemm_nonlop_kpt(ikpt)%nprojs 861 ngrads = gemm_nonlop_kpt(ikpt)%ngrads 862 nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_blk 863 864 if(gemm_nonlop_is_distributed) then 865 rank = xmpi_comm_rank(gemm_nonlop_block_comm); nprocs = xmpi_comm_size(gemm_nonlop_block_comm) 866 is_last = (rank==nprocs-1) 867 if(is_last) nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_last_blk 868 end if 869 870 if(choice==1) ngrads=0 871 ABI_CHECK(ngrads>=3.or.choice/=2 ,"Bad allocation in gemm_nonlop (2)!") 872 ABI_CHECK(ngrads>=6.or.choice/=3 ,"Bad allocation in gemm_nonlop (3)!") 873 ABI_CHECK(ngrads>=9.or.choice/=23,"Bad allocation in gemm_nonlop (23)!") 874 875 ! Allocate and copy GPU buffers if user doesn't manage them 876 transfer_vectin=.not. xomp_target_is_present(c_loc(vectin)) & 877 .and. (cpopt < 2 .or. (choice/=7 .and.paw_opt >=3)) 878 transfer_vectout=.not. xomp_target_is_present(c_loc(vectout)) & 879 .and. (signs==2 .and. (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)) 880 transfer_svectout=.not. xomp_target_is_present(c_loc(svectout)) & 881 .and. (signs==2 .and. (paw_opt == 3 .or. paw_opt == 4)) 882 !$OMP TARGET ENTER DATA MAP(to:vectin) IF(transfer_vectin) 883 !$OMP TARGET ENTER DATA MAP(alloc:vectout) IF(transfer_vectout) 884 !$OMP TARGET ENTER DATA MAP(alloc:svectout) IF(transfer_svectout) 885 886 !FIXME These seemingly useless pointers are used in BLAS operations for 887 ! working around a bug in AOMP LLVM misreading mapped device pointers. 888 ! I chose to generalise the workaround to avoid dupplicating each 889 ! BLAS call specifically for handling AOMP LLVM. 890 vectin_ => vectin 891 vectout_ => vectout 892 svectout_ => svectout 893 894 !$OMP TARGET ENTER DATA MAP(to:atindx1,indlmn,enl) 895 if(gpu_initialised == 0 .or. mod__ndat /= ndat*nspinor) then 896 call alloc_work_buffers(cplex, cplex_fac,& 897 & nspinor*ndat, nprojs, ntypat, lmnmax, MAX(npwin,npwout)) 898 if(paw_opt>=2 .and. choice > 0 .and. 
choice /= 7) then 899 if (cplex_enl==1) then 900 do itypat=1, ntypat 901 nlmn=count(indlmn(3,:,itypat)>0) 902 do ilmn=1,nlmn*(nlmn+1)/2 903 sij_typ(ilmn,itypat)=sij(ilmn,itypat) 904 end do 905 end do 906 else 907 do itypat=1, ntypat 908 nlmn=count(indlmn(3,:,itypat)>0) 909 do ilmn=1,nlmn*(nlmn+1)/2 910 sij_typ(ilmn,itypat)=sij(2*ilmn-1,itypat) 911 end do 912 end do 913 end if 914 end if 915 !$OMP TARGET UPDATE TO(sij_typ) 916 end if 917 if(current_ikpt_in_gpu /= gemm_nonlop_ikpt_this_proc_being_treated) then 918 call refresh_gemm_nonlop_kpt_ompgpu(gemm_nonlop_ikpt_this_proc_being_treated) 919 end if 920 921 ! If vectproj is provided, use it for further calculations, use static array otherwise 922 projections_ptr => projections 923 local_vectproj=.false. 924 if(PRESENT(vectproj)) then 925 if(size(vectproj)>1) local_vectproj=.true. 926 end if 927 if (local_vectproj) projections_ptr => vectproj 928 929 #ifdef HAVE_GPU_CUDA 930 !Work buffers allocated at each call to save memory in CUDA 931 !$OMP TARGET ENTER DATA MAP(alloc:s_projections,vnl_projections) 932 if(.not. local_vectproj) then 933 !$OMP TARGET ENTER DATA MAP(alloc:projections_ptr) 934 end if 935 #endif 936 937 if(gemm_nonlop_is_distributed) then 938 ABI_MALLOC(projs_recv, (cplex, npwin, gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 939 !$OMP TARGET ENTER DATA MAP(alloc:projs_recv) 940 if (signs==1.and.ngrads>0) then 941 ABI_MALLOC(dprojs_recv, (cplex, npwin, ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 942 !$OMP TARGET ENTER DATA MAP(alloc:dprojs_recv) 943 end if 944 end if 945 946 ! These will store the non-local factors for vectin, svectout and vectout respectively 947 #if defined HAVE_GPU_CUDA 948 byte_count=sizeof(projections_ptr) 949 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,s_projections,vnl_projections) 950 if(cpopt < 2) call gpu_memset(c_loc(projections_ptr), 0, byte_count) 951 call gpu_memset(c_loc(s_projections), 0, byte_count) 952 call gpu_memset(c_loc(vnl_projections), 0, byte_count) 953 !$OMP END TARGET DATA 954 #elif defined HAVE_GPU_HIP 955 if(cpopt < 2) then 956 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 957 !$OMP& MAP(to:projections_ptr) PRIVATE(i1,i2) 958 do i2=1, nspinor*ndat 959 do i1=1, nprojs 960 projections_ptr(:,i1,i2) = zero 961 end do 962 end do 963 end if 964 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 965 !$OMP& MAP(to:s_projections,vnl_projections) PRIVATE(i1,i2) 966 do i2=1, nspinor*ndat 967 do i1=1, nprojs 968 s_projections(:,i1,i2) = zero 969 vnl_projections(:,i1,i2) = zero 970 end do 971 end do 972 #endif 973 974 if (signs==1.and.ngrads>0) then 975 ABI_MALLOC(dprojections,(cplex, ngrads*nprojs, nspinor*ndat)) 976 !$OMP TARGET ENTER DATA MAP(alloc:dprojections) 977 #if defined HAVE_GPU_CUDA 978 byte_count=sizeof(dprojections) 979 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections) 980 call gpu_memset(c_loc(dprojections), 0, byte_count) 981 !$OMP END TARGET DATA 982 #elif defined HAVE_GPU_HIP 983 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 984 !$OMP& MAP(to:dprojections) PRIVATE(i1,i2) 985 do i2=1, nspinor*ndat 986 do i1=1, nprojs*ngrads 987 dprojections(:,i1,i2) = zero 988 end do 989 end do 990 #endif 991 if(choice==1.or.choice==3.or.choice==23) then 992 ABI_MALLOC(enlk,(ndat)) 993 enlk=zero 994 !$OMP TARGET ENTER DATA MAP(to:enlk,nattyp) 995 end if 996 end if 997 998 if(nprojs == 0) then 999 ! 
TODO check if this is correct 1000 if(signs == 1) then 1001 enlout=zero 1002 return 1003 end if 1004 if(signs == 2) then 1005 vectout = zero 1006 if(paw_opt>0) svectout = vectin 1007 return 1008 end if 1009 end if 1010 1011 if(signs == 1) then 1012 enlout=zero 1013 !$OMP TARGET ENTER DATA MAP(to:enlout) 1014 end if 1015 1016 !$OMP TASKWAIT 1017 if(cpopt >= 2) then 1018 ! retrieve from cprjin 1019 if(.not. local_vectproj .and. cpopt/=3) then 1020 !TODO This use-case is extremely painful for GEMM OpenGPU nonlop performance 1021 ABI_WARNING("Inefficient call of OpenGPU nonlop. Was vectproj provided with OpenMP mapping?") 1022 !$OMP TARGET UPDATE FROM(projections_ptr) 1023 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,nlmn) 1024 do idat=1, ndat*nspinor 1025 shift = 0 1026 do iatom = 1, natom 1027 nlmn = cprjin(iatom, idat)%nlmn 1028 projections_ptr(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) 1029 shift = shift + nlmn 1030 end do 1031 end do 1032 !$OMP TARGET UPDATE TO(projections_ptr) 1033 end if 1034 if(cpopt==4.and.allocated(dprojections)) then 1035 !$OMP TARGET UPDATE FROM(dprojections) 1036 ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! (1)") 1037 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,igrad,nlmn) 1038 do idat=1, ndat*nspinor 1039 shift = 0 1040 do iatom = 1, natom 1041 nlmn = cprjin(iatom, idat)%nlmn 1042 do igrad=1,ngrads 1043 dprojections(1:cplex, shift+1:shift+nlmn, idat) = & 1044 & cprjin(iatom, idat)%dcp(1:cplex,igrad,1:ilmn) 1045 shift = shift + nlmn 1046 end do 1047 end do 1048 end do 1049 !$OMP TARGET UPDATE TO(dprojections) 1050 end if 1051 else 1052 ! opernla 1053 if(cplex == 2) then 1054 if(.not. gemm_nonlop_is_distributed) then 1055 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,current_ikpt_projs,vectin_) 1056 call abi_gpu_xgemm(cplex, 'C', 'N', nprojs, ndat*nspinor, npwin, cone, & 1057 & c_loc(current_ikpt_projs), npwin,& 1058 & c_loc(vectin_), npwin, czero, c_loc(projections_ptr), nprojs) 1059 !$OMP END TARGET DATA 1060 if(signs==1.and.ngrads>0) then 1061 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections,current_ikpt_dprojs,vectin_) 1062 call abi_gpu_xgemm(cplex, 'C', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1063 c_loc(current_ikpt_dprojs), npwin,& 1064 c_loc(vectin_), npwin, czero, c_loc(dprojections), ngrads*nprojs) 1065 !$OMP END TARGET DATA 1066 end if 1067 else 1068 call gemm_nonlop_ompgpu_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1069 & nprojs,& 1070 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1071 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1072 & nprojs_my_blk,cplex,czero,& 1073 & current_ikpt_projs,projs_recv,& 1074 & vectin,projections_ptr) 1075 if(signs==1.and.ngrads>0) then 1076 call gemm_nonlop_ompgpu_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1077 & ngrads*nprojs,& 1078 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1079 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1080 & ngrads*nprojs_my_blk,cplex,czero,& 1081 & current_ikpt_dprojs,dprojs_recv,& 1082 & vectin,dprojections) 1083 end if 1084 end if 1085 else 1086 ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i 1087 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1088 !!$OMP TARGET LOOP & 1089 !$OMP& MAP(to:temp_realvec_r,vectin) PRIVATE(i) 1090 do i=1, npwin*nspinor*ndat 1091 temp_realvec_r(i) = vectin(1,i) 1092 end do 1093 if(istwf_k == 2 .and. 
mpi_enreg%me_g0 == 1) then 1094 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1095 !!$OMP TARGET LOOP & 1096 !$OMP& MAP(to:temp_realvec_r) PRIVATE(idat) 1097 do idat=1, ndat*nspinor 1098 temp_realvec_r(1+(idat-1)*npwin) = temp_realvec_r(1+(idat-1)*npwin)/2 1099 end do 1100 end if 1101 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,current_ikpt_projs_r,temp_realvec_r) 1102 call abi_gpu_xgemm(cplex, 'T', 'N', nprojs, ndat*nspinor, npwin, cone, & 1103 & c_loc(current_ikpt_projs_r), npwin, & 1104 & c_loc(temp_realvec_r), npwin, czero, c_loc(projections_ptr), nprojs) 1105 !$OMP END TARGET DATA 1106 if(signs==1.and.ngrads>0) then 1107 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections,current_ikpt_dprojs_r,temp_realvec_r) 1108 call abi_gpu_xgemm(cplex, 'T', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1109 & c_loc(current_ikpt_dprojs_r), npwin, & 1110 & c_loc(temp_realvec_r), npwin, czero, c_loc(dprojections), ngrads*nprojs) 1111 !$OMP END TARGET DATA 1112 end if 1113 1114 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1115 !!$OMP TARGET LOOP & 1116 !$OMP& MAP(to:temp_realvec_i,vectin) PRIVATE(i) 1117 do i=1, npwin*nspinor*ndat 1118 temp_realvec_i(i) = vectin(2,i) 1119 end do 1120 if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then 1121 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1122 !!$OMP TARGET LOOP & 1123 !$OMP& MAP(to:temp_realvec_i) PRIVATE(idat) 1124 do idat=1, ndat*nspinor 1125 temp_realvec_i(1+(idat-1)*npwin) = zero 1126 end do 1127 end if 1128 1129 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,current_ikpt_projs_i,temp_realvec_i) 1130 call abi_gpu_xgemm(cplex, 'T', 'N', nprojs, ndat*nspinor, npwin, cone, & 1131 c_loc(current_ikpt_projs_i), npwin, & 1132 c_loc(temp_realvec_i), npwin, cone , c_loc(projections_ptr), nprojs) 1133 !$OMP END TARGET DATA 1134 1135 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 1136 !!$OMP TARGET LOOP & 1137 !$OMP& MAP(to:projections_ptr) PRIVATE(i1,i2) 1138 do i2=1, nspinor*ndat 1139 do i1=1, nprojs 1140 projections_ptr(1,i1,i2) = projections_ptr(1,i1,i2) * 2 1141 end do 1142 end do 1143 1144 if(signs==1.and.ngrads>0) then 1145 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections,current_ikpt_dprojs_i,temp_realvec_i) 1146 call abi_gpu_xgemm(cplex, 'T', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1147 c_loc(current_ikpt_dprojs_i), npwin, & 1148 c_loc(temp_realvec_i), npwin, cone , c_loc(dprojections), ngrads*nprojs) 1149 !$OMP END TARGET DATA 1150 1151 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 1152 !!$OMP TARGET LOOP & 1153 !$OMP& MAP(to:dprojections) PRIVATE(i1,i2) 1154 do i2=1, nspinor*ndat 1155 do i1=1, ngrads*nprojs 1156 dprojections(1,i1,i2) = dprojections(1,i1,i2) * 2 1157 end do 1158 end do 1159 end if 1160 end if 1161 1162 if(cpopt >= 0) then 1163 ! store in cprjin 1164 if(.not. local_vectproj .and. cpopt/=3) then 1165 !TODO This use-case is extremely painful for GEMM OpenGPU nonlop performance 1166 !$OMP TARGET UPDATE FROM(projections_ptr) 1167 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,nlmn) 1168 do idat=1, ndat*nspinor 1169 shift = 0 1170 do iatom = 1, natom 1171 nlmn = cprjin(iatom, idat)%nlmn 1172 cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections_ptr(1:cplex, shift+1:shift+nlmn, idat) 1173 shift = shift + nlmn 1174 end do 1175 end do 1176 end if 1177 if(cpopt==3) then 1178 ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! 
(2)") 1179 !$OMP TARGET UPDATE FROM(dprojections) 1180 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,igrad,nlmn) 1181 do idat=1, ndat*nspinor 1182 shift = 0 1183 do iatom = 1, natom 1184 nlmn = cprjin(iatom, idat)%nlmn 1185 do igrad=1,ngrads 1186 cprjin(iatom, idat)%dcp(1:cplex,igrad,1:nlmn) = & 1187 & dprojections(1:cplex, shift+1:shift+nlmn, idat) 1188 shift = shift + nlmn 1189 end do 1190 end do 1191 end do 1192 end if 1193 end if 1194 end if 1195 1196 1197 1198 if(choice > 0) then 1199 1200 if(choice /= 7) then 1201 ! opernlc 1202 iatm = 0 1203 ndgxdt = 0 1204 ndgxdtfac = 0 1205 nd2gxdt = 0 1206 nd2gxdtfac = 0 1207 optder = 0 1208 1209 shift = 0 1210 do itypat=1, ntypat 1211 nlmn=count(indlmn(3,:,itypat)>0) 1212 1213 ibeg = shift+1 1214 iend = shift+nattyp(itypat)*nlmn 1215 1216 call opernlc_ylm_ompgpu(atindx1,cplex,cplex_dgxdt,cplex_d2gxdt,cplex_enl,cplex_fac,& 1217 & dgxdt_dum_in,dgxdt_dum_out,dgxdt_dum_out2,& 1218 & d2gxdt_dum_in,d2gxdt_dum_out,d2gxdt_dum_out2,dimenl1,dimenl2,dimekbq,enl,& 1219 & projections_ptr,& 1220 & vnl_projections,& 1221 & s_projections,& 1222 & iatm,indlmn,itypat,lambda,mpi_enreg,natom,ndgxdt,ndgxdtfac,nd2gxdt,nd2gxdtfac,& 1223 & nattyp(itypat),nlmn,nspinor,nspinortot,optder,paw_opt,sij_typ,ndat,ibeg-1,iend,nprojs,ntypat) 1224 1225 shift = shift + nattyp(itypat)*nlmn 1226 iatm = iatm+nattyp(itypat) 1227 end do 1228 else 1229 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 1230 !!$OMP TARGET LOOP & 1231 !$OMP& MAP(to:projections_ptr,s_projections) PRIVATE(i1,i2) 1232 do i2=1, nspinor*ndat 1233 do i1=1, nprojs 1234 s_projections(1,i1,i2) = projections_ptr(1,i1,i2) 1235 s_projections(2,i1,i2) = projections_ptr(2,i1,i2) 1236 end do 1237 end do 1238 end if 1239 1240 ! opernlb (only choice=1) 1241 if(signs==2) then 1242 if(paw_opt == 3 .or. paw_opt == 4) then 1243 ! Get svectout from s_projections 1244 if(cplex == 2) then 1245 if(.not. 
gemm_nonlop_is_distributed) then 1246 !$OMP TARGET DATA USE_DEVICE_PTR(s_projections,current_ikpt_projs,svectout_) 1247 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1248 c_loc(current_ikpt_projs), npwout,& 1249 c_loc(s_projections), nprojs, czero, c_loc(svectout_), npwout) 1250 !$OMP END TARGET DATA 1251 else 1252 call gemm_nonlop_ompgpu_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1253 & nprojs,& 1254 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1255 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1256 & nprojs_my_blk,cplex,& 1257 & current_ikpt_projs,projs_recv,& 1258 & s_projections,svectout) 1259 end if 1260 else 1261 !$OMP TARGET DATA USE_DEVICE_PTR(s_projections,current_ikpt_projs_r,temp_realvec_r) 1262 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1263 c_loc(current_ikpt_projs_r), npwout, & 1264 c_loc(s_projections), nprojs, czero, c_loc(temp_realvec_r), npwout) 1265 !$OMP END TARGET DATA 1266 !$OMP TARGET DATA USE_DEVICE_PTR(s_projections,current_ikpt_projs_i,temp_realvec_i) 1267 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1268 c_loc(current_ikpt_projs_i), npwout,& 1269 c_loc(s_projections), nprojs, czero, c_loc(temp_realvec_i), npwout) 1270 !$OMP END TARGET DATA 1271 1272 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1273 !!$OMP TARGET LOOP & 1274 !$OMP& MAP(to:temp_realvec_r,temp_realvec_i,svectout) PRIVATE(i) 1275 do i=1, npwin*nspinor*ndat 1276 svectout(1,i) = temp_realvec_r(i) 1277 svectout(2,i) = temp_realvec_i(i) 1278 end do 1279 end if 1280 if(choice /= 7) then 1281 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1282 !!$OMP TARGET LOOP & 1283 !$OMP& MAP(to:vectin,svectout) PRIVATE(i) 1284 do i=1, npwin*nspinor*ndat 1285 svectout(1,i) = svectout(1,i) + vectin(1,i) 1286 svectout(2,i) = svectout(2,i) + vectin(2,i) 1287 end do 1288 end if 1289 end if 1290 if(paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then 1291 ! Get vectout from vnl_projections 1292 if(cplex_fac == 2) then 1293 if(.not. 
gemm_nonlop_is_distributed) then 1294 !$OMP TARGET DATA USE_DEVICE_PTR(vnl_projections,current_ikpt_projs,vectout_) 1295 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1296 c_loc(current_ikpt_projs), npwout, & 1297 c_loc(vnl_projections), nprojs, czero, c_loc(vectout_), npwout) 1298 !$OMP END TARGET DATA 1299 else 1300 call gemm_nonlop_ompgpu_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1301 & nprojs,& 1302 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1303 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1304 & nprojs_my_blk,cplex,& 1305 & current_ikpt_projs,projs_recv,& 1306 & vnl_projections,vectout) 1307 end if 1308 else 1309 !$OMP TARGET DATA USE_DEVICE_PTR(vnl_projections,current_ikpt_projs_r,temp_realvec_r) 1310 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1311 c_loc(current_ikpt_projs_r), npwout, & 1312 c_loc(vnl_projections), nprojs, czero, c_loc(temp_realvec_r), npwout) 1313 !$OMP END TARGET DATA 1314 !$OMP TARGET DATA USE_DEVICE_PTR(vnl_projections,current_ikpt_projs_i,temp_realvec_i) 1315 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1316 c_loc(current_ikpt_projs_i), npwout, & 1317 c_loc(vnl_projections), nprojs, czero, c_loc(temp_realvec_i), npwout) 1318 !$OMP END TARGET DATA 1319 1320 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1321 !!$OMP TARGET LOOP & 1322 !$OMP& MAP(to:temp_realvec_r,temp_realvec_i,vectout) PRIVATE(i) 1323 do i=1, npwin*nspinor*ndat 1324 vectout(1,i) = temp_realvec_r(i) 1325 vectout(2,i) = temp_realvec_i(i) 1326 end do 1327 end if 1328 end if 1329 end if ! opernlb 1330 1331 ! opernld 1332 if(signs==1) then 1333 #ifdef HAVE_GPU_HIP 1334 !$OMP TARGET UPDATE FROM(vnl_projections,projections_ptr,dprojections) 1335 #endif 1336 if(choice==1.or.choice==3.or.choice==23) then 1337 shift=0 1338 iatm=0 1339 esum=zero 1340 do itypat=1, ntypat 1341 nlmn=count(indlmn(3,:,itypat)>0) 1342 ibeg = shift+1 1343 iend = shift+nattyp(itypat)*nlmn 1344 nattyp_i = nattyp(itypat) 1345 #ifndef HAVE_GPU_HIP 1346 !$OMP TARGET TEAMS DISTRIBUTE & 1347 !$OMP& MAP(to:vnl_projections,projections_ptr,enlk) & 1348 !$OMP& FIRSTPRIVATE(idat,itypat,nlmn,esum) 1349 #endif 1350 do idat=1,ndat*nspinor 1351 esum=zero 1352 #ifndef HAVE_GPU_HIP 1353 !$OMP PARALLEL DO COLLAPSE(3) REDUCTION(+:esum) & 1354 !$OMP& PRIVATE(ia,ilmn,ii) 1355 #endif 1356 do ia=1,nattyp_i 1357 do ilmn=1,nlmn 1358 do ii=1,cplex 1359 esum=esum +vnl_projections(ii,shift+(ia-1)*nlmn+ilmn,idat) & 1360 & *projections_ptr (ii,shift+(ia-1)*nlmn+ilmn,idat) 1361 end do 1362 end do 1363 end do 1364 enlk(idat) = enlk(idat) + esum 1365 end do 1366 shift = shift + nattyp(itypat)*nlmn 1367 iatm = iatm+nattyp(itypat) 1368 end do 1369 if (choice==1) then 1370 #ifndef HAVE_GPU_HIP 1371 !$OMP TARGET PARALLEL DO MAP(to:enlout,enlk) PRIVATE(idat) 1372 #endif 1373 do idat=1,ndat 1374 enlout(idat)=enlk(idat) 1375 end do 1376 end if 1377 end if ! 
choice=1/3/23 1378 if(choice==2.or.choice==3.or.choice==23) then 1379 grad_shift=merge(9,6,choice==23) 1380 if (choice==3.or.choice==23) then 1381 shift=0 1382 iatm=0 1383 do itypat=1, ntypat 1384 nlmn=count(indlmn(3,:,itypat)>0) 1385 ibeg = shift+1 1386 iend = shift+nattyp(itypat)*nlmn 1387 nattyp_i = nattyp(itypat) 1388 1389 #ifndef HAVE_GPU_HIP 1390 !$OMP TARGET TEAMS DISTRIBUTE COLLAPSE(2) & 1391 !$OMP& MAP(to:vnl_projections,dprojections,enlout) & 1392 !$OMP& FIRSTPRIVATE(idat,itypat,nlmn,esum) 1393 #endif 1394 do idat=1,ndat*nspinor 1395 do igrad=1,6 1396 esum=zero 1397 #ifndef HAVE_GPU_HIP 1398 !$OMP PARALLEL DO COLLAPSE(3) REDUCTION(+:esum) & 1399 !$OMP& PRIVATE(ia,ilmn,ii) 1400 #endif 1401 do ia=1,nattyp_i 1402 !Following loops are a [D][Z]DOT 1403 do ilmn=1,nlmn 1404 do ii=1,cplex 1405 esum=esum +vnl_projections(ii,shift+(ia-1)*nlmn+ilmn,idat) & 1406 & *dprojections (ii,grad_shift*shift + (ia-1)*nlmn*grad_shift + (igrad-1)*nlmn +ilmn,idat) 1407 end do 1408 end do 1409 end do 1410 enlout((idat-1)*nnlout+igrad) = enlout((idat-1)*nnlout+igrad) + two*esum 1411 end do 1412 end do 1413 1414 shift = shift + nattyp(itypat)*nlmn 1415 iatm = iatm+nattyp(itypat) 1416 end do 1417 end if 1418 1419 if (choice==2.or.choice==23) then 1420 shift=0 1421 iatm=0 1422 force_shift=merge(6,0,choice==23) 1423 do itypat=1, ntypat 1424 nlmn=count(indlmn(3,:,itypat)>0) 1425 nattyp_i = nattyp(itypat) 1426 1427 #ifndef HAVE_GPU_HIP 1428 !$OMP TARGET TEAMS DISTRIBUTE COLLAPSE(3) & 1429 !$OMP& MAP(to:vnl_projections,dprojections,enlout) & 1430 !$OMP& FIRSTPRIVATE(idat,itypat,nlmn,esum) 1431 #endif 1432 do idat=1,ndat*nspinor 1433 do ia=1,nattyp_i 1434 do igrad=1,3 1435 !Following loops are a [D][Z]DOT 1436 esum=zero 1437 #ifndef HAVE_GPU_HIP 1438 !$OMP PARALLEL DO COLLAPSE(2) REDUCTION(+:esum) & 1439 !$OMP& PRIVATE(ilmn,ii) 1440 #endif 1441 do ilmn=1,nlmn 1442 do ii=1,cplex 1443 esum=esum +vnl_projections(ii,shift+(ia-1)*nlmn+ilmn,idat) & 1444 & *dprojections(ii,grad_shift*shift+(ia-1)*nlmn*grad_shift+(igrad-1+force_shift)*nlmn +ilmn,idat) 1445 end do 1446 end do 1447 enlout((idat-1)*nnlout + force_shift + (iatm+ia-1)*3 + igrad)= & 1448 & enlout((idat-1)*nnlout + force_shift + (iatm+ia-1)*3 + igrad) + two*esum 1449 end do 1450 end do 1451 end do 1452 1453 shift = shift + nattyp(itypat)*nlmn 1454 iatm = iatm+nattyp(itypat) 1455 end do 1456 end if 1457 end if ! choice=2, 3 or 23 1458 #ifndef HAVE_GPU_HIP 1459 !$OMP TARGET UPDATE FROM(enlout,enlk) 1460 #endif 1461 end if !opernld 1462 1463 end if ! choice>0 1464 1465 !$OMP TASKWAIT 1466 ! Reduction in case of parallelism 1467 if (signs==1.and.mpi_enreg%paral_spinor==1) then 1468 if (size(enlout)>0) then 1469 call xmpi_sum(enlout,mpi_enreg%comm_spinor,ierr) 1470 end if 1471 if (choice==3.or.choice==23) then 1472 call xmpi_sum(enlk,mpi_enreg%comm_spinor,ierr) 1473 end if 1474 end if 1475 1476 ! Derivatives wrt strain 1477 ! - Convert from reduced to cartesian coordinates 1478 ! - Substract volume contribution 1479 if ((choice==3.or.choice==23).and.signs==1.and.paw_opt<=3) then 1480 do idat=1,ndat 1481 enlout_shift=(idat-1)*nnlout 1482 call strconv(enlout(enlout_shift+1:enlout_shift+6),gprimd,work) 1483 enlout(enlout_shift+1:enlout_shift+3)=(work(1:3)-enlk(idat)) 1484 enlout(enlout_shift+4:enlout_shift+6)= work(4:6) 1485 end do 1486 end if 1487 1488 ! 
Retrieve and release allocated buffers 1489 !$OMP TARGET EXIT DATA MAP(delete:vectin) IF(transfer_vectin) 1490 !$OMP TARGET EXIT DATA MAP(from:vectout) IF(transfer_vectout) 1491 !$OMP TARGET EXIT DATA MAP(from:svectout) IF(transfer_svectout) 1492 1493 #ifdef HAVE_GPU_CUDA 1494 !$OMP TARGET EXIT DATA MAP(delete:s_projections,vnl_projections) 1495 if(.not. local_vectproj) then 1496 !$OMP TARGET EXIT DATA MAP(delete:projections_ptr) 1497 end if 1498 #endif 1499 1500 ! Release memory 1501 if(signs == 1) then 1502 !$OMP TARGET EXIT DATA MAP(delete:enlout) 1503 end if 1504 if(gemm_nonlop_is_distributed) then 1505 !$OMP TARGET EXIT DATA MAP(delete:projs_recv) 1506 ABI_FREE(projs_recv) 1507 if (signs==1.and.ngrads>0) then 1508 !$OMP TARGET EXIT DATA MAP(delete:dprojs_recv) 1509 ABI_FREE(dprojs_recv) 1510 end if 1511 end if 1512 !$OMP TARGET EXIT DATA MAP(delete:atindx1,indlmn,enl) 1513 if (allocated(dprojections)) then 1514 !$OMP TARGET EXIT DATA MAP(delete:dprojections) 1515 ABI_FREE(dprojections) 1516 end if 1517 if (allocated(enlk)) then 1518 !$OMP TARGET EXIT DATA MAP(delete:enlk,nattyp) 1519 ABI_FREE(enlk) 1520 end if 1521 1522 end subroutine gemm_nonlop_ompgpu 1523 1524 !*** 1525 1526 1527 #else 1528 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1529 !!!!!! stubs for compiling with OpenMP GPU offload disabled. 1530 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1531 1532 subroutine init_gemm_nonlop_ompgpu(nkpt) 1533 integer,intent(in) :: nkpt 1534 1535 ABI_UNUSED((/nkpt/)) 1536 ABI_BUG("Unhandled configuration for OpenMP GPU immplementation") 1537 end subroutine init_gemm_nonlop_ompgpu
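A detail of the routine above worth isolating is how it avoids redundant host-device traffic: vectin, vectout and svectout are mapped only when the caller has not already placed them on the GPU, by combining presence tests (xomp_target_is_present in the actual code) with IF clauses on the TARGET ENTER/EXIT DATA directives. The following is a minimal, self-contained sketch of that pattern, with the presence information passed in as plain logicals and placeholder array names; it is not the module's interface.

subroutine conditional_map_sketch(x, y, caller_mapped_x, caller_mapped_y)
  ! Map data to the GPU only if the caller has not done it already,
  ! and copy back / release only what was mapped here.
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  real(dp), intent(in)    :: x(:,:)
  real(dp), intent(inout) :: y(:,:)
  logical,  intent(in)    :: caller_mapped_x, caller_mapped_y
  logical :: transfer_x, transfer_y
  integer :: i, j, n1, n2

  transfer_x = .not. caller_mapped_x
  transfer_y = .not. caller_mapped_y
  n1 = size(y, 1); n2 = size(y, 2)

  !$omp target enter data map(to: x)    if(transfer_x)
  !$omp target enter data map(alloc: y) if(transfer_y)

  !$omp target teams distribute parallel do collapse(2)
  do j = 1, n2
    do i = 1, n1
      y(i, j) = 2.0_dp * x(i, j)   ! stand-in for the actual nonlocal work
    end do
  end do

  !$omp target exit data map(delete: x) if(transfer_x)
  !$omp target exit data map(from: y)   if(transfer_y)
end subroutine conditional_map_sketch

The same ENTER/EXIT DATA pairs guarded by IF clauses appear near the beginning and end of gemm_nonlop_ompgpu (transfer_vectin, transfer_vectout, transfer_svectout).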
m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernla [ Functions ]
NAME
gemm_nonlop_ompgpu_distributed_gemm_opernla
FUNCTION
Distributed version of the "opernla" GEMM called in gemm_nonlop_ompgpu.
INPUTS
SOURCE
555 subroutine gemm_nonlop_ompgpu_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 556 & nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,beta,& 557 & projs_local,projs_recv,vectin,cprojs) 558 integer, intent(in) :: rank,nprocs,npwin,ndat,nspinor 559 integer, intent(in) :: nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex 560 real(dp), intent(in) :: vectin(*) 561 complex(dpc), intent(in) :: beta 562 real(dp), intent(inout), target :: projs_local(cplex,npwin,nprojs_last_blk),projs_recv(cplex,npwin,nprojs_last_blk) 563 real(dp), intent(out) :: cprojs(:,:,:) 564 565 !Local variables 566 integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next,idat 567 real(dp), ABI_CONTIGUOUS pointer :: recv_buf(:,:,:), work_buf(:,:,:) 568 real(dp), ABI_CONTIGUOUS pointer :: recv_buf_f(:), work_buf_f(:) 569 570 571 ! ************************************************************************* 572 573 rank_next=modulo(rank + 1,nprocs) 574 rank_prev=rank - 1 575 if(rank_prev == -1) rank_prev = nprocs - 1 576 do iblock=1,nprocs 577 578 if(rank+iblock == nprocs) then 579 nprojs_cur_blk = nprojs_last_blk 580 else 581 nprojs_cur_blk = nprojs_blk 582 end if 583 584 if(modulo(iblock,2)==1) then 585 work_buf => projs_local(1:cplex,1:npwin,1:nprojs_last_blk) 586 recv_buf => projs_recv(1:cplex,1:npwin,1:nprojs_last_blk) 587 else 588 work_buf => projs_recv(1:cplex,1:npwin,1:nprojs_last_blk) 589 recv_buf => projs_local(1:cplex,1:npwin,1:nprojs_last_blk) 590 end if 591 592 #ifndef HAVE_GPU_MPI 593 594 ! GPU-aware MPI not available : perform MPI comms on CPU 595 call xmpi_isend(work_buf,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 596 call xmpi_irecv(recv_buf,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 597 !$OMP TARGET UPDATE TO(work_buf) 598 599 #else 600 601 ! GPU-aware MPI available : pass GPU buffers to MPI 602 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 603 call c_f_pointer(c_loc(work_buf), work_buf_f, [cplex*npwin*nprojs_last_blk]) 604 call c_f_pointer(c_loc(recv_buf), recv_buf_f, [cplex*npwin*nprojs_last_blk]) 605 call MPI_ISEND(work_buf_f,cplex*npwin*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 606 & rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 607 call MPI_IRECV(recv_buf_f,cplex*npwin*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 608 & rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 609 !$OMP END TARGET DATA 610 611 #endif 612 613 614 ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk 615 iend = ibeg+nprojs_cur_blk-1 616 if(cplex == 2) then 617 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,vectin,cprojs) 618 call abi_gpu_xgemm(cplex, 'C','N', & 619 nprojs_cur_blk, ndat*nspinor, npwin, cone, & 620 c_loc(work_buf), npwin, & 621 c_loc(vectin), npwin, & 622 beta, & 623 c_loc(cprojs(1,ibeg,1)), nprojs) 624 !$OMP END TARGET DATA 625 else 626 call DGEMM('T', 'N', nprojs_cur_blk, ndat*nspinor, npwin, one, & 627 & work_buf, npwin, & 628 & vectin, npwin, real(beta), cprojs(:,ibeg:iend,:), nprojs_cur_blk) 629 end if 630 631 call xmpi_waitall(req,ierr) 632 633 end do 634 635 if(modulo(iblock,2)==1) then 636 #ifndef HAVE_GPU_MPI 637 call DCOPY(cplex*npwin*nprojs_cur_blk, recv_buf, 1, work_buf, 1) 638 #else 639 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 640 call copy_gpu_to_gpu(c_loc(work_buf), c_loc(recv_buf), INT(cplex, c_size_t)*npwin*nprojs_last_blk*dp) 641 !$OMP END TARGET DATA 642 #endif 643 end if 644 end subroutine gemm_nonlop_ompgpu_distributed_gemm_opernla
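The routine above overlaps communication with computation: at each step a rank launches the block GEMM on the projector block it currently holds, while that block is already being forwarded to the previous rank and the next block is arriving from the following rank. The sketch below reproduces this ring pipeline with host MPI and DGEMM only, assuming equal block sizes (the real routine also handles a larger last block and a GPU-aware MPI path, and avoids the final copy by alternating buffers); every name is a placeholder.

subroutine ring_opernla_sketch(comm, npw, ndat, nblk, nptot, blk_local, psi, proj_all)
  ! Ring exchange: at step k this rank computes with the block owned by rank
  ! modulo(me+k-1,nprocs) while that block is simultaneously sent on to rank me-1.
  use mpi
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  integer, intent(in) :: comm, npw, ndat, nblk, nptot   ! nptot = nprocs*nblk
  real(dp), intent(in)  :: blk_local(npw, nblk)         ! projector block owned by this rank
  real(dp), intent(in)  :: psi(npw, ndat)               ! ndat wavefunctions
  real(dp), intent(out) :: proj_all(nptot, ndat)        ! all projections, filled block by block

  integer :: me, nprocs, ierr, step, owner, ibeg, req(2)
  real(dp), allocatable :: work(:,:), recv_buf(:,:)

  call MPI_Comm_rank(comm, me, ierr)
  call MPI_Comm_size(comm, nprocs, ierr)
  allocate(work(npw, nblk), recv_buf(npw, nblk))
  work = blk_local

  do step = 1, nprocs
    ! Post the exchange for the next step before computing on the current block.
    call MPI_Isend(work,     npw*nblk, MPI_DOUBLE_PRECISION, modulo(me-1, nprocs), step, comm, req(1), ierr)
    call MPI_Irecv(recv_buf, npw*nblk, MPI_DOUBLE_PRECISION, modulo(me+1, nprocs), step, comm, req(2), ierr)

    owner = modulo(me + step - 1, nprocs)   ! rank whose projector block we hold right now
    ibeg  = owner*nblk + 1
    ! "opernla" for one block: proj_all(ibeg:ibeg+nblk-1,:) = work^T * psi
    call dgemm('T', 'N', nblk, ndat, npw, 1.0_dp, work, npw, psi, npw, 0.0_dp, proj_all(ibeg, 1), nptot)

    call MPI_Waitall(2, req, MPI_STATUSES_IGNORE, ierr)
    work = recv_buf
  end do
end subroutine ring_opernla_sketch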
m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernlb [ Functions ]
NAME
gemm_nonlop_ompgpu_distributed_gemm_opernlb
FUNCTION
Distributed version of the "opernlb" GEMM called in gemm_nonlop_ompgpu.
INPUTS
SOURCE
659 subroutine gemm_nonlop_ompgpu_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 660 & nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,& 661 & projs_local,projs_recv,cprojs_s_vnl,vectout) 662 integer, intent(in) :: rank,nprocs,npwout,ndat,nspinor 663 integer, intent(in) :: nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex 664 real(dp), intent(in) :: cprojs_s_vnl(:,:,:) 665 real(dp), intent(inout), target :: projs_local(cplex,npwout,nprojs_last_blk),projs_recv(cplex,npwout,nprojs_last_blk) 666 real(dp), intent(out) :: vectout(*) 667 668 !Local variables 669 integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next 670 complex(dpc) :: beta 671 real(dp), ABI_CONTIGUOUS pointer :: recv_buf(:,:,:), work_buf(:,:,:) 672 real(dp), ABI_CONTIGUOUS pointer :: recv_buf_f(:), work_buf_f(:) 673 674 ! ************************************************************************* 675 676 rank_next=modulo(rank + 1,nprocs) 677 rank_prev=rank - 1 678 if(rank_prev == -1) rank_prev = nprocs - 1 679 680 beta = czero 681 682 do iblock=1,nprocs 683 684 if(rank+iblock == nprocs) then 685 nprojs_cur_blk = nprojs_last_blk 686 else 687 nprojs_cur_blk = nprojs_blk 688 end if 689 690 if(modulo(iblock,2)==1) then 691 work_buf => projs_local(1:cplex,1:npwout,1:nprojs_last_blk) 692 recv_buf => projs_recv(1:cplex,1:npwout,1:nprojs_last_blk) 693 else 694 work_buf => projs_recv(1:cplex,1:npwout,1:nprojs_last_blk) 695 recv_buf => projs_local(1:cplex,1:npwout,1:nprojs_last_blk) 696 end if 697 698 #ifndef HAVE_GPU_MPI 699 700 ! GPU-aware MPI not available : perform MPI comms on CPU 701 call xmpi_isend(work_buf,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 702 call xmpi_irecv(recv_buf,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 703 !$OMP TARGET UPDATE TO(work_buf) 704 705 #else 706 707 ! GPU-aware MPI available : pass GPU buffers to MPI 708 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 709 call c_f_pointer(c_loc(work_buf), work_buf_f, [cplex*npwout*nprojs_last_blk]) 710 call c_f_pointer(c_loc(recv_buf), recv_buf_f, [cplex*npwout*nprojs_last_blk]) 711 call MPI_ISEND(work_buf_f,cplex*npwout*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 712 & rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 713 call MPI_IRECV(recv_buf_f,cplex*npwout*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 714 & rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 715 !$OMP END TARGET DATA 716 717 #endif 718 719 ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk 720 iend = ibeg+nprojs_cur_blk-1 721 if(cplex==2) then 722 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,vectout,cprojs_s_vnl) 723 call abi_gpu_xgemm(cplex, 'N','N', & 724 npwout, ndat*nspinor, nprojs_cur_blk, cone, & 725 c_loc(work_buf), npwout, & 726 c_loc(cprojs_s_vnl(1,ibeg,1)), nprojs, & 727 beta, & 728 c_loc(vectout), npwout) 729 !$OMP END TARGET DATA 730 else 731 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs_cur_blk, one, & 732 & work_buf, npwout, & 733 & cprojs_s_vnl(:,ibeg:iend,:), nprojs_cur_blk, real(beta), vectout, npwout) 734 end if 735 736 beta = cone 737 738 call xmpi_waitall(req,ierr) 739 740 end do 741 742 if(modulo(iblock,2)==1) then 743 #ifndef HAVE_GPU_MPI 744 call DCOPY(cplex*npwout*nprojs_cur_blk, recv_buf, 1, work_buf, 1) 745 #else 746 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 747 call copy_gpu_to_gpu(c_loc(work_buf), c_loc(recv_buf), INT(cplex, c_size_t)*npwout*nprojs_last_blk*dp) 748 !$OMP END TARGET DATA 749 #endif 750 end if 751 752 end subroutine gemm_nonlop_ompgpu_distributed_gemm_opernlb
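The opernlb counterpart runs the same ring, but its per-block GEMMs accumulate into the output wavefunctions: the first block overwrites vectout (beta = czero) and every later block adds to it (beta = cone, set at the end of the loop above). The self-contained sketch below keeps only that accumulation pattern, with all projector blocks available locally and placeholder names; in the distributed routine the blocks arrive through the ring exchange shown for opernla.

subroutine blockwise_opernlb_sketch(npw, ndat, nblk, nblocks, projs_blocks, cprojs, vectout)
  ! vectout = sum over blocks of P_block * c_block, built with one DGEMM per block.
  ! First block: beta = 0 (overwrite); later blocks: beta = 1 (accumulate).
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  integer, intent(in) :: npw, ndat, nblk, nblocks
  real(dp), intent(in)  :: projs_blocks(npw, nblk, nblocks)   ! projector blocks
  real(dp), intent(in)  :: cprojs(nblk*nblocks, ndat)         ! (Vnl or S) applied to the projections
  real(dp), intent(out) :: vectout(npw, ndat)
  integer :: ib, ibeg
  real(dp) :: beta

  beta = 0.0_dp
  do ib = 1, nblocks
    ibeg = (ib - 1)*nblk + 1
    call dgemm('N', 'N', npw, ndat, nblk, 1.0_dp, projs_blocks(1, 1, ib), npw, &
&              cprojs(ibeg, 1), nblk*nblocks, beta, vectout, npw)
    beta = 1.0_dp   ! from the second block on, add to what is already in vectout
  end do
end subroutine blockwise_opernlb_sketch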
m_gemm_nonlop_ompgpu/init_gemm_nonlop_ompgpu [ Functions ]
NAME
init_gemm_nonlop_ompgpu
FUNCTION
Initialization of the gemm_nonlop_kpt array
INPUTS
nkpt= number of k-points
SOURCE
398 subroutine init_gemm_nonlop_ompgpu(nkpt) 399 400 integer,intent(in) :: nkpt 401 integer :: ikpt 402 403 ! ************************************************************************* 404 405 ! TODO only allocate the number of kpt treated by this proc 406 ABI_MALLOC(gemm_nonlop_kpt, (nkpt)) 407 do ikpt=1,nkpt 408 gemm_nonlop_kpt(ikpt)%nprojs = -1 409 gemm_nonlop_kpt(ikpt)%ngrads = -1 410 end do 411 mod__nkpt=nkpt 412 413 end subroutine init_gemm_nonlop_ompgpu
m_gemm_nonlop_ompgpu/make_gemm_nonlop_ompgpu [ Functions ]
NAME
make_gemm_nonlop_ompgpu
FUNCTION
Build the gemm_nonlop_kpt array for one k-point and upload the corresponding projector data to the GPU
INPUTS
SOURCE
514 subroutine make_gemm_nonlop_ompgpu(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 515 & ph3d_k,kpt_k,kg_k,kpg_k, & 516 & compute_grad_strain,compute_grad_atom) ! Optional parameters 517 518 integer, intent(in) :: ikpt 519 integer, intent(in) :: npw, lmnmax,ntypat 520 integer, intent(in) :: indlmn(:,:,:), kg_k(:,:) 521 integer, intent(in) :: nattyp(ntypat) 522 integer, intent(in) :: istwf_k 523 logical, intent(in), optional :: compute_grad_strain,compute_grad_atom 524 real(dp), intent(in) :: ucvol 525 real(dp), intent(in) :: ffnl_k(:,:,:,:) 526 real(dp), intent(in) :: ph3d_k(:,:,:) 527 real(dp), intent(in) :: kpt_k(:) 528 real(dp), intent(in), target :: kpg_k(:,:) 529 530 integer :: cplex 531 ! ************************************************************************* 532 533 call free_gemm_nonlop_kpt_ompgpu() 534 call make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 535 & ph3d_k,kpt_k,kg_k,kpg_k, & 536 & compute_grad_strain=compute_grad_strain,compute_grad_atom=compute_grad_atom) 537 cplex=2;if (istwf_k>1) cplex=1 538 call alloc_gemm_nonlop_kpt_ompgpu(ikpt,cplex) 539 540 end subroutine make_gemm_nonlop_ompgpu