TABLE OF CONTENTS
- ABINIT/m_gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/free_gemm_nonlop_ompgpu_ikpt
- m_gemm_nonlop_ompgpu/destroy_gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernla
- m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernlb
- m_gemm_nonlop_ompgpu/init_gemm_nonlop_ompgpu
- m_gemm_nonlop_ompgpu/make_gemm_nonlop_ompgpu
ABINIT/m_gemm_nonlop_ompgpu [ Modules ]
NAME
m_gemm_nonlop_ompgpu
FUNCTION
This module provides functions to compute the nonlocal operator by means of the BLAS GEMM routine, offloaded to GPUs through OpenMP target directives. By treating ndat wavefunctions simultaneously, it is able to exploit BLAS3 routines, which leads to excellent efficiency and scalability.
COPYRIGHT
Copyright (C) 2014-2024 ABINIT group (MS) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
18 ! TODO list : 19 ! Don't allocate the full nkpt structures, only those that are treated by this proc: use same init as in m_bandfft_kpt 20 ! support more options (forces & stresses mostly) 21 ! Support RF/other computations (only GS right now) 22 ! handle the case where nloalg(2) < 0, ie no precomputation of ph3d 23 ! more systematic checking of the workflow (right now, only works if init/make/gemm/destroy, no multiple makes, etc) 24 ! Avoid allocating the complex matrix when istwfk > 1 25 ! Merge with chebfi's invovl 26 27 28 #if defined HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include "abi_common.h" 33 34 module m_gemm_nonlop_ompgpu 35 36 use defs_basis 37 use m_errors 38 use m_abicore 39 use m_xmpi 40 use m_xomp 41 use m_abi_linalg 42 43 #ifdef HAVE_FC_ISO_C_BINDING 44 use iso_c_binding 45 #endif 46 47 #ifdef HAVE_GPU 48 use m_gpu_toolbox 49 #endif 50 51 use defs_abitypes, only : MPI_type 52 use m_opernlc_ylm_ompgpu, only : opernlc_ylm_ompgpu 53 use m_pawcprj, only : pawcprj_type 54 use m_geometry, only : strconv 55 use m_kg, only : mkkpg 56 use m_gemm_nonlop 57 58 #if defined HAVE_MPI2 59 use mpi 60 #endif 61 62 implicit none 63 64 private 65 66 ! Use these routines in order: first call init, then call make_gemm_nonlop for each k point, 67 ! then call gemm_nonlop to do the actual computation, and call destroy when done. See gstate and vtorho. 68 public :: init_gemm_nonlop_ompgpu 69 public :: destroy_gemm_nonlop_ompgpu 70 public :: make_gemm_nonlop_ompgpu 71 public :: gemm_nonlop_ompgpu 72 73 ! Those routines are here to assess memory requirements 74 public :: gemm_nonlop_ompgpu_work_mem 75 public :: gemm_nonlop_ompgpu_static_mem
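To illustrate the central idea independently of the module's own data structures: instead of forming each projection <p_i|psi_n> with separate dot products or ndat matrix-vector calls, the ndat*nspinor wavefunctions are gathered as columns of a single matrix so that one GEMM produces all projections at once. The sketch below is a minimal, self-contained illustration of that batching with a plain BLAS ZGEMM; the names projs, vects and projections are placeholders, not the module's actual arrays.

program gemm_batching_sketch
  ! projections(i,n) = sum_g conjg(projs(g,i)) * vects(g,n), for all i and n in one ZGEMM
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  integer, parameter :: npw = 512, nprojs = 36, ndat = 8
  complex(dp), parameter :: cone = (1.0_dp, 0.0_dp), czero = (0.0_dp, 0.0_dp)
  complex(dp) :: projs(npw, nprojs)        ! projectors <g|p_i>
  complex(dp) :: vects(npw, ndat)          ! ndat wavefunctions treated simultaneously
  complex(dp) :: projections(nprojs, ndat) ! <p_i|psi_n>
  real(dp) :: rea(npw, nprojs), ima(npw, nprojs), reb(npw, ndat), imb(npw, ndat)

  call random_number(rea); call random_number(ima); projs = cmplx(rea, ima, kind=dp)
  call random_number(reb); call random_number(imb); vects = cmplx(reb, imb, kind=dp)

  ! One BLAS3 call replaces nprojs*ndat dot products (or ndat ZGEMV calls)
  call zgemm('C', 'N', nprojs, ndat, npw, cone, projs, npw, vects, npw, czero, projections, nprojs)

  write(*,*) 'sample projection:', projections(1,1)
end program gemm_batching_sketch

When istwf_k > 1 the module follows the same pattern with the real and imaginary parts stored separately (projs_r/projs_i) and DGEMM instead of ZGEMM, which is why both variants appear in the SOURCE blocks below.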
m_gemm_nonlop_ompgpu/free_gemm_nonlop_ompgpu_ikpt [ Functions ]
NAME
free_gemm_nonlop_ompgpu_ikpt
FUNCTION
Release memory for one k-point entry of the gemm_nonlop_kpt array
INPUTS
ikpt= index of the gemm_nonlop_kpt entry to be released
SOURCE
467 subroutine free_gemm_nonlop_ompgpu_ikpt(ikpt) 468 469 integer,intent(in) :: ikpt 470 471 ! ************************************************************************* 472 473 if(gemm_nonlop_kpt(ikpt)%nprojs /= -1) then 474 if (allocated(gemm_nonlop_kpt(ikpt)%projs)) then 475 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs) 476 end if 477 if (allocated(gemm_nonlop_kpt(ikpt)%projs_r)) then 478 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_r) 479 end if 480 if (allocated(gemm_nonlop_kpt(ikpt)%projs_i)) then 481 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_i) 482 end if 483 gemm_nonlop_kpt(ikpt)%nprojs = -1 484 if(gemm_nonlop_kpt(ikpt)%ngrads /= -1) then 485 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs)) then 486 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs) 487 end if 488 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_r)) then 489 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_r) 490 end if 491 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_i)) then 492 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_i) 493 end if 494 gemm_nonlop_kpt(ikpt)%ngrads = -1 495 end if 496 end if 497 498 end subroutine free_gemm_nonlop_ompgpu_ikpt
m_gemm_nonlop_ompgpu/destroy_gemm_nonlop_ompgpu [ Functions ]
NAME
destroy_gemm_nonlop_ompgpu
FUNCTION
Destruction of the gemm_nonlop_kpt array
INPUTS
None. The routine releases all k-point entries, using the number of k-points (mod__nkpt) stored by init_gemm_nonlop_ompgpu.
SOURCE
429 subroutine destroy_gemm_nonlop_ompgpu() 430 431 integer :: ikpt 432 character(len=200) :: msg 433 434 ! ************************************************************************* 435 436 if(mod__nkpt==0) then 437 msg="GPU nonlop arrays aren't initialized. Redundant call" 438 ABI_ERROR(msg) 439 end if 440 441 call free_work_buffers() 442 call free_gemm_nonlop_kpt_ompgpu() 443 ! TODO add cycling if kpt parallelism 444 do ikpt = 1,mod__nkpt 445 call free_gemm_nonlop_ompgpu_ikpt(ikpt) 446 end do 447 448 ABI_FREE(gemm_nonlop_kpt) 449 mod__nkpt=0 450 451 end subroutine destroy_gemm_nonlop_ompgpu
m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu [ Functions ]
NAME
gemm_nonlop_ompgpu
FUNCTION
Replacement of nonlop. Same prototype as nonlop, although not all options are implemented.
INPUTS
[gpu_option] = GPU implementation to use, e.g. CUDA, OpenMP offload, ... (0=not using GPU)
SOURCE
768 subroutine gemm_nonlop_ompgpu(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,& 769 & enl,enlout,ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,& 770 & kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,& 771 & mpi_enreg,mpsang,mpssoang,natom,nattyp,ndat,ngfft,nkpgin,nkpgout,nloalg,& 772 & nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO,paw_opt,phkxredin,& 773 & phkxredout,ph1d,ph3din,ph3dout,signs,sij,svectout,& 774 & tim_nonlop,ucvol,useylm,vectin,vectout,& 775 & vectproj,gpu_option) 776 777 !Arguments ------------------------------------ 778 !scalars 779 integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,idir 780 integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,mpsang,mpssoang,natom,ndat,nkpgin 781 integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO 782 integer,intent(in) :: paw_opt,signs,tim_nonlop,useylm 783 integer,optional,intent(in) :: gpu_option 784 real(dp),intent(in) :: lambda(ndat),ucvol 785 type(MPI_type),intent(in) :: mpi_enreg 786 !arrays 787 integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin) 788 integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3) 789 real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq) 790 real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat) 791 real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3) 792 real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin*useylm) 793 real(dp),intent(in) :: kpgout(npwout,nkpgout*useylm),kptin(3),kptout(3) 794 real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom),ph1d(2,*) 795 real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3)) 796 real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk) 797 real(dp),intent(inout),target :: vectin(2,npwin*nspinor*ndat) 798 real(dp),intent(inout) :: enlout(nnlout*ndat) 799 real(dp),intent(out),target :: svectout(:,:) 800 real(dp),intent(inout),target :: vectout(:,:) 801 real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:) 802 type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat) 803 804 805 ! locals 806 integer :: ii, ia, idat, igrad, nprojs, ngrads, shift, iatom, nlmn, ierr, ibeg, iend, i1, i2, i, ikpt 807 integer :: cplex, cplex_enl, cplex_fac, proj_shift, grad_shift, nattyp_i 808 integer :: enlout_shift, force_shift, nnlout_test 809 integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn 810 integer :: cplex_dgxdt(1), cplex_d2gxdt(1) 811 real(dp) :: esum,esumk(9) 812 real(dp) :: work(6) 813 real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1) 814 real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1) 815 real(dp), allocatable :: enlk(:) 816 real(dp),pointer :: projections_ptr(:,:,:) 817 integer :: nprojs_my_blk, ipw, iproj, iblock, nprojs_blk 818 integer :: rank, nprocs, req(2) 819 logical :: is_last 820 real(dp), allocatable :: projs_recv(:,:,:), dprojs_recv(:,:,:) 821 logical :: local_vectproj 822 logical :: transfer_vectin,transfer_vectout,transfer_svectout 823 character(len=500) :: msg 824 integer(C_SIZE_T) :: byte_count 825 real(dp), ABI_CONTIGUOUS pointer :: vectin_(:,:),vectout_(:,:),svectout_(:,:) 826 827 ! ************************************************************************* 828 829 ! 
We keep the same interface as nonlop, but we don't use many of those 830 ABI_UNUSED((/ffnlin,ffnlout,gmet,kpgin,kpgout/)) 831 ABI_UNUSED((/ph1d(1,1),ph3din,ph3dout/)) 832 ABI_UNUSED((/phkxredin,phkxredout,ucvol/)) 833 ABI_UNUSED((/mgfft,mpsang,mpssoang/)) 834 ABI_UNUSED((/kptin,kptout/)) 835 ABI_UNUSED((/idir,nloalg,ngfft,kgin,kgout,ngfft,only_SO,tim_nonlop,gpu_option/)) 836 837 ! Check supported options 838 if ( (choice>1.and.choice/=7.and.signs==2) .or. & 839 & (choice>3.and.choice/=7.and.choice/=23.and.signs==1) .or. & 840 & (useylm/=1) ) then 841 ABI_BUG('gemm_nonlop option not supported!') 842 end if 843 if (signs==1) then 844 nnlout_test=0 845 if (choice==1) nnlout_test=1 846 if (choice==2) nnlout_test=3*natom 847 if (choice==3) nnlout_test=6 848 if (choice==23) nnlout_test=6+3*natom 849 if (nnlout<nnlout_test) then 850 ABI_BUG('wrong nnlout size!') 851 end if 852 end if 853 854 ikpt=gemm_nonlop_ikpt_this_proc_being_treated 855 cplex=2;if (istwf_k>1) cplex=1 856 cplex_enl=1;if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex? 857 cplex_fac=max(cplex,dimekbq) 858 if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex? 859 860 nprojs = gemm_nonlop_kpt(ikpt)%nprojs 861 ngrads = gemm_nonlop_kpt(ikpt)%ngrads 862 nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_blk 863 864 if(gemm_nonlop_is_distributed) then 865 rank = xmpi_comm_rank(gemm_nonlop_block_comm); nprocs = xmpi_comm_size(gemm_nonlop_block_comm) 866 is_last = (rank==nprocs-1) 867 if(is_last) nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_last_blk 868 end if 869 870 if(choice==1) ngrads=0 871 ABI_CHECK(ngrads>=3.or.choice/=2 ,"Bad allocation in gemm_nonlop (2)!") 872 ABI_CHECK(ngrads>=6.or.choice/=3 ,"Bad allocation in gemm_nonlop (3)!") 873 ABI_CHECK(ngrads>=9.or.choice/=23,"Bad allocation in gemm_nonlop (23)!") 874 875 ! Allocate and copy GPU buffers if user doesn't manage them 876 transfer_vectin=.not. xomp_target_is_present(c_loc(vectin)) & 877 .and. (cpopt < 2 .or. (choice/=7 .and.paw_opt >=3)) 878 transfer_vectout=.not. xomp_target_is_present(c_loc(vectout)) & 879 .and. (signs==2 .and. (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)) 880 transfer_svectout=.not. xomp_target_is_present(c_loc(svectout)) & 881 .and. (signs==2 .and. (paw_opt == 3 .or. paw_opt == 4)) 882 !$OMP TARGET ENTER DATA MAP(to:vectin) IF(transfer_vectin) 883 !$OMP TARGET ENTER DATA MAP(alloc:vectout) IF(transfer_vectout) 884 !$OMP TARGET ENTER DATA MAP(alloc:svectout) IF(transfer_svectout) 885 886 !FIXME These seemingly useless pointers are used in BLAS operations for 887 ! working around a bug in AOMP LLVM misreading mapped device pointers. 888 ! I chose to generalise the workaround to avoid dupplicating each 889 ! BLAS call specifically for handling AOMP LLVM. 890 vectin_ => vectin 891 vectout_ => vectout 892 svectout_ => svectout 893 894 !$OMP TARGET ENTER DATA MAP(to:atindx1,indlmn,enl) 895 if(gpu_initialised == 0 .or. mod__ndat /= ndat*nspinor) then 896 call alloc_work_buffers(cplex, cplex_fac,& 897 & nspinor*ndat, nprojs, ntypat, lmnmax, MAX(npwin,npwout)) 898 if(paw_opt>=2 .and. choice > 0 .and. 
choice /= 7) then 899 if (cplex_enl==1) then 900 do itypat=1, ntypat 901 nlmn=count(indlmn(3,:,itypat)>0) 902 do ilmn=1,nlmn*(nlmn+1)/2 903 sij_typ(ilmn,itypat)=sij(ilmn,itypat) 904 end do 905 end do 906 else 907 do itypat=1, ntypat 908 nlmn=count(indlmn(3,:,itypat)>0) 909 do ilmn=1,nlmn*(nlmn+1)/2 910 sij_typ(ilmn,itypat)=sij(2*ilmn-1,itypat) 911 end do 912 end do 913 end if 914 end if 915 !$OMP TARGET UPDATE TO(sij_typ) 916 end if 917 if(current_ikpt_in_gpu /= gemm_nonlop_ikpt_this_proc_being_treated) then 918 call refresh_gemm_nonlop_kpt_ompgpu(gemm_nonlop_ikpt_this_proc_being_treated) 919 end if 920 921 ! If vectproj is provided, use it for further calculations, use static array otherwise 922 projections_ptr => projections 923 local_vectproj=.false. 924 if(PRESENT(vectproj)) then 925 if(size(vectproj)>1) local_vectproj=.true. 926 end if 927 if (local_vectproj) projections_ptr => vectproj 928 929 #ifdef HAVE_GPU_CUDA 930 !Work buffers allocated at each call to save memory in CUDA 931 !$OMP TARGET ENTER DATA MAP(alloc:s_projections,vnl_projections) 932 if(.not. local_vectproj) then 933 !$OMP TARGET ENTER DATA MAP(alloc:projections_ptr) 934 end if 935 #endif 936 937 if(gemm_nonlop_is_distributed) then 938 ABI_MALLOC(projs_recv, (cplex, npwin, gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 939 !$OMP TARGET ENTER DATA MAP(alloc:projs_recv) 940 if (signs==1.and.ngrads>0) then 941 ABI_MALLOC(dprojs_recv, (cplex, npwin, ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 942 !$OMP TARGET ENTER DATA MAP(alloc:dprojs_recv) 943 end if 944 end if 945 946 ! These will store the non-local factors for vectin, svectout and vectout respectively 947 #if defined HAVE_GPU_CUDA 948 byte_count=sizeof(projections_ptr) 949 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,s_projections,vnl_projections) 950 if(cpopt < 2) call gpu_memset(c_loc(projections_ptr), 0, byte_count) 951 call gpu_memset(c_loc(s_projections), 0, byte_count) 952 call gpu_memset(c_loc(vnl_projections), 0, byte_count) 953 !$OMP END TARGET DATA 954 #elif defined HAVE_GPU_HIP 955 if(cpopt < 2) then 956 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 957 !$OMP& MAP(to:projections_ptr) PRIVATE(i1,i2) 958 do i2=1, nspinor*ndat 959 do i1=1, nprojs 960 projections_ptr(:,i1,i2) = zero 961 end do 962 end do 963 end if 964 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 965 !$OMP& MAP(to:s_projections,vnl_projections) PRIVATE(i1,i2) 966 do i2=1, nspinor*ndat 967 do i1=1, nprojs 968 s_projections(:,i1,i2) = zero 969 vnl_projections(:,i1,i2) = zero 970 end do 971 end do 972 #endif 973 974 if (signs==1.and.ngrads>0) then 975 ABI_MALLOC(dprojections,(cplex, ngrads*nprojs, nspinor*ndat)) 976 !$OMP TARGET ENTER DATA MAP(alloc:dprojections) 977 #if defined HAVE_GPU_CUDA 978 byte_count=sizeof(dprojections) 979 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections) 980 call gpu_memset(c_loc(dprojections), 0, byte_count) 981 !$OMP END TARGET DATA 982 #elif defined HAVE_GPU_HIP 983 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 984 !$OMP& MAP(to:dprojections) PRIVATE(i1,i2) 985 do i2=1, nspinor*ndat 986 do i1=1, nprojs*ngrads 987 dprojections(:,i1,i2) = zero 988 end do 989 end do 990 #endif 991 if(choice==1.or.choice==3.or.choice==23) then 992 ABI_MALLOC(enlk,(ndat)) 993 enlk=zero 994 !$OMP TARGET ENTER DATA MAP(to:enlk,nattyp) 995 end if 996 end if 997 998 if(nprojs == 0) then 999 ! 
TODO check if this is correct 1000 if(signs == 1) then 1001 enlout=zero 1002 return 1003 end if 1004 if(signs == 2) then 1005 vectout = zero 1006 if(paw_opt>0) svectout = vectin 1007 return 1008 end if 1009 end if 1010 1011 if(signs == 1) then 1012 enlout=zero 1013 !$OMP TARGET ENTER DATA MAP(to:enlout) 1014 end if 1015 1016 !$OMP TASKWAIT 1017 if(cpopt >= 2) then 1018 ! retrieve from cprjin 1019 if(.not. local_vectproj .and. cpopt/=3) then 1020 !TODO This use-case is extremely painful for GEMM OpenGPU nonlop performance 1021 ABI_WARNING("Inefficient call of OpenGPU nonlop. Was vectproj provided with OpenMP mapping?") 1022 !$OMP TARGET UPDATE FROM(projections_ptr) 1023 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,nlmn) 1024 do idat=1, ndat*nspinor 1025 shift = 0 1026 do iatom = 1, natom 1027 nlmn = cprjin(iatom, idat)%nlmn 1028 projections_ptr(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) 1029 shift = shift + nlmn 1030 end do 1031 end do 1032 !$OMP TARGET UPDATE TO(projections_ptr) 1033 end if 1034 if(cpopt==4.and.allocated(dprojections)) then 1035 !$OMP TARGET UPDATE FROM(dprojections) 1036 ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! (1)") 1037 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,igrad,nlmn) 1038 do idat=1, ndat*nspinor 1039 shift = 0 1040 do iatom = 1, natom 1041 nlmn = cprjin(iatom, idat)%nlmn 1042 do igrad=1,ngrads 1043 dprojections(1:cplex, shift+1:shift+nlmn, idat) = & 1044 & cprjin(iatom, idat)%dcp(1:cplex,igrad,1:ilmn) 1045 shift = shift + nlmn 1046 end do 1047 end do 1048 end do 1049 !$OMP TARGET UPDATE TO(dprojections) 1050 end if 1051 else 1052 ! opernla 1053 if(cplex == 2) then 1054 if(.not. gemm_nonlop_is_distributed) then 1055 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,current_ikpt_projs,vectin_) 1056 call abi_gpu_xgemm(cplex, 'C', 'N', nprojs, ndat*nspinor, npwin, cone, & 1057 & c_loc(current_ikpt_projs), npwin,& 1058 & c_loc(vectin_), npwin, czero, c_loc(projections_ptr), nprojs) 1059 !$OMP END TARGET DATA 1060 if(signs==1.and.ngrads>0) then 1061 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections,current_ikpt_dprojs,vectin_) 1062 call abi_gpu_xgemm(cplex, 'C', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1063 c_loc(current_ikpt_dprojs), npwin,& 1064 c_loc(vectin_), npwin, czero, c_loc(dprojections), ngrads*nprojs) 1065 !$OMP END TARGET DATA 1066 end if 1067 else 1068 call gemm_nonlop_ompgpu_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1069 & nprojs,& 1070 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1071 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1072 & nprojs_my_blk,cplex,czero,& 1073 & current_ikpt_projs,projs_recv,& 1074 & vectin,projections_ptr) 1075 if(signs==1.and.ngrads>0) then 1076 call gemm_nonlop_ompgpu_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1077 & ngrads*nprojs,& 1078 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1079 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1080 & ngrads*nprojs_my_blk,cplex,czero,& 1081 & current_ikpt_dprojs,dprojs_recv,& 1082 & vectin,dprojections) 1083 end if 1084 end if 1085 else 1086 ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i 1087 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1088 !!$OMP TARGET LOOP & 1089 !$OMP& MAP(to:temp_realvec_r,vectin) PRIVATE(i) 1090 do i=1, npwin*nspinor*ndat 1091 temp_realvec_r(i) = vectin(1,i) 1092 end do 1093 if(istwf_k == 2 .and. 
mpi_enreg%me_g0 == 1) then 1094 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1095 !!$OMP TARGET LOOP & 1096 !$OMP& MAP(to:temp_realvec_r) PRIVATE(idat) 1097 do idat=1, ndat*nspinor 1098 temp_realvec_r(1+(idat-1)*npwin) = temp_realvec_r(1+(idat-1)*npwin)/2 1099 end do 1100 end if 1101 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,current_ikpt_projs_r,temp_realvec_r) 1102 call abi_gpu_xgemm(cplex, 'T', 'N', nprojs, ndat*nspinor, npwin, cone, & 1103 & c_loc(current_ikpt_projs_r), npwin, & 1104 & c_loc(temp_realvec_r), npwin, czero, c_loc(projections_ptr), nprojs) 1105 !$OMP END TARGET DATA 1106 if(signs==1.and.ngrads>0) then 1107 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections,current_ikpt_dprojs_r,temp_realvec_r) 1108 call abi_gpu_xgemm(cplex, 'T', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1109 & c_loc(current_ikpt_dprojs_r), npwin, & 1110 & c_loc(temp_realvec_r), npwin, czero, c_loc(dprojections), ngrads*nprojs) 1111 !$OMP END TARGET DATA 1112 end if 1113 1114 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1115 !!$OMP TARGET LOOP & 1116 !$OMP& MAP(to:temp_realvec_i,vectin) PRIVATE(i) 1117 do i=1, npwin*nspinor*ndat 1118 temp_realvec_i(i) = vectin(2,i) 1119 end do 1120 if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then 1121 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1122 !!$OMP TARGET LOOP & 1123 !$OMP& MAP(to:temp_realvec_i) PRIVATE(idat) 1124 do idat=1, ndat*nspinor 1125 temp_realvec_i(1+(idat-1)*npwin) = zero 1126 end do 1127 end if 1128 1129 !$OMP TARGET DATA USE_DEVICE_PTR(projections_ptr,current_ikpt_projs_i,temp_realvec_i) 1130 call abi_gpu_xgemm(cplex, 'T', 'N', nprojs, ndat*nspinor, npwin, cone, & 1131 c_loc(current_ikpt_projs_i), npwin, & 1132 c_loc(temp_realvec_i), npwin, cone , c_loc(projections_ptr), nprojs) 1133 !$OMP END TARGET DATA 1134 1135 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 1136 !!$OMP TARGET LOOP & 1137 !$OMP& MAP(to:projections_ptr) PRIVATE(i1,i2) 1138 do i2=1, nspinor*ndat 1139 do i1=1, nprojs 1140 projections_ptr(1,i1,i2) = projections_ptr(1,i1,i2) * 2 1141 end do 1142 end do 1143 1144 if(signs==1.and.ngrads>0) then 1145 !$OMP TARGET DATA USE_DEVICE_PTR(dprojections,current_ikpt_dprojs_i,temp_realvec_i) 1146 call abi_gpu_xgemm(cplex, 'T', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1147 c_loc(current_ikpt_dprojs_i), npwin, & 1148 c_loc(temp_realvec_i), npwin, cone , c_loc(dprojections), ngrads*nprojs) 1149 !$OMP END TARGET DATA 1150 1151 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 1152 !!$OMP TARGET LOOP & 1153 !$OMP& MAP(to:dprojections) PRIVATE(i1,i2) 1154 do i2=1, nspinor*ndat 1155 do i1=1, ngrads*nprojs 1156 dprojections(1,i1,i2) = dprojections(1,i1,i2) * 2 1157 end do 1158 end do 1159 end if 1160 end if 1161 1162 if(cpopt >= 0) then 1163 ! store in cprjin 1164 if(.not. local_vectproj .and. cpopt/=3) then 1165 !TODO This use-case is extremely painful for GEMM OpenGPU nonlop performance 1166 !$OMP TARGET UPDATE FROM(projections_ptr) 1167 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,nlmn) 1168 do idat=1, ndat*nspinor 1169 shift = 0 1170 do iatom = 1, natom 1171 nlmn = cprjin(iatom, idat)%nlmn 1172 cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections_ptr(1:cplex, shift+1:shift+nlmn, idat) 1173 shift = shift + nlmn 1174 end do 1175 end do 1176 end if 1177 if(cpopt==3) then 1178 ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! 
(2)") 1179 !$OMP TARGET UPDATE FROM(dprojections) 1180 !$OMP PARALLEL DO PRIVATE(shift,idat,iatom,igrad,nlmn) 1181 do idat=1, ndat*nspinor 1182 shift = 0 1183 do iatom = 1, natom 1184 nlmn = cprjin(iatom, idat)%nlmn 1185 do igrad=1,ngrads 1186 cprjin(iatom, idat)%dcp(1:cplex,igrad,1:nlmn) = & 1187 & dprojections(1:cplex, shift+1:shift+nlmn, idat) 1188 shift = shift + nlmn 1189 end do 1190 end do 1191 end do 1192 end if 1193 end if 1194 end if 1195 1196 1197 1198 if(choice > 0) then 1199 1200 if(choice /= 7) then 1201 ! opernlc 1202 iatm = 0 1203 ndgxdt = 0 1204 ndgxdtfac = 0 1205 nd2gxdt = 0 1206 nd2gxdtfac = 0 1207 optder = 0 1208 1209 shift = 0 1210 do itypat=1, ntypat 1211 nlmn=count(indlmn(3,:,itypat)>0) 1212 1213 ibeg = shift+1 1214 iend = shift+nattyp(itypat)*nlmn 1215 1216 call opernlc_ylm_ompgpu(atindx1,cplex,cplex_dgxdt,cplex_d2gxdt,cplex_enl,cplex_fac,& 1217 & dgxdt_dum_in,dgxdt_dum_out,dgxdt_dum_out2,& 1218 & d2gxdt_dum_in,d2gxdt_dum_out,d2gxdt_dum_out2,dimenl1,dimenl2,dimekbq,enl,& 1219 & projections_ptr,& 1220 & vnl_projections,& 1221 & s_projections,& 1222 & iatm,indlmn,itypat,lambda,mpi_enreg,natom,ndgxdt,ndgxdtfac,nd2gxdt,nd2gxdtfac,& 1223 & nattyp(itypat),nlmn,nspinor,nspinortot,optder,paw_opt,sij_typ,ndat,ibeg-1,iend,nprojs,ntypat) 1224 1225 shift = shift + nattyp(itypat)*nlmn 1226 iatm = iatm+nattyp(itypat) 1227 end do 1228 else 1229 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 1230 !!$OMP TARGET LOOP & 1231 !$OMP& MAP(to:projections_ptr,s_projections) PRIVATE(i1,i2) 1232 do i2=1, nspinor*ndat 1233 do i1=1, nprojs 1234 s_projections(1,i1,i2) = projections_ptr(1,i1,i2) 1235 s_projections(2,i1,i2) = projections_ptr(2,i1,i2) 1236 end do 1237 end do 1238 end if 1239 1240 ! opernlb (only choice=1) 1241 if(signs==2) then 1242 if(paw_opt == 3 .or. paw_opt == 4) then 1243 ! Get svectout from s_projections 1244 if(cplex == 2) then 1245 if(.not. 
gemm_nonlop_is_distributed) then 1246 !$OMP TARGET DATA USE_DEVICE_PTR(s_projections,current_ikpt_projs,svectout_) 1247 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1248 c_loc(current_ikpt_projs), npwout,& 1249 c_loc(s_projections), nprojs, czero, c_loc(svectout_), npwout) 1250 !$OMP END TARGET DATA 1251 else 1252 call gemm_nonlop_ompgpu_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1253 & nprojs,& 1254 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1255 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1256 & nprojs_my_blk,cplex,& 1257 & current_ikpt_projs,projs_recv,& 1258 & s_projections,svectout) 1259 end if 1260 else 1261 !$OMP TARGET DATA USE_DEVICE_PTR(s_projections,current_ikpt_projs_r,temp_realvec_r) 1262 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1263 c_loc(current_ikpt_projs_r), npwout, & 1264 c_loc(s_projections), nprojs, czero, c_loc(temp_realvec_r), npwout) 1265 !$OMP END TARGET DATA 1266 !$OMP TARGET DATA USE_DEVICE_PTR(s_projections,current_ikpt_projs_i,temp_realvec_i) 1267 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1268 c_loc(current_ikpt_projs_i), npwout,& 1269 c_loc(s_projections), nprojs, czero, c_loc(temp_realvec_i), npwout) 1270 !$OMP END TARGET DATA 1271 1272 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1273 !!$OMP TARGET LOOP & 1274 !$OMP& MAP(to:temp_realvec_r,temp_realvec_i,svectout) PRIVATE(i) 1275 do i=1, npwin*nspinor*ndat 1276 svectout(1,i) = temp_realvec_r(i) 1277 svectout(2,i) = temp_realvec_i(i) 1278 end do 1279 end if 1280 if(choice /= 7) then 1281 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1282 !!$OMP TARGET LOOP & 1283 !$OMP& MAP(to:vectin,svectout) PRIVATE(i) 1284 do i=1, npwin*nspinor*ndat 1285 svectout(1,i) = svectout(1,i) + vectin(1,i) 1286 svectout(2,i) = svectout(2,i) + vectin(2,i) 1287 end do 1288 end if 1289 end if 1290 if(paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then 1291 ! Get vectout from vnl_projections 1292 if(cplex_fac == 2) then 1293 if(.not. 
gemm_nonlop_is_distributed) then 1294 !$OMP TARGET DATA USE_DEVICE_PTR(vnl_projections,current_ikpt_projs,vectout_) 1295 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1296 c_loc(current_ikpt_projs), npwout, & 1297 c_loc(vnl_projections), nprojs, czero, c_loc(vectout_), npwout) 1298 !$OMP END TARGET DATA 1299 else 1300 call gemm_nonlop_ompgpu_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1301 & nprojs,& 1302 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1303 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1304 & nprojs_my_blk,cplex,& 1305 & current_ikpt_projs,projs_recv,& 1306 & vnl_projections,vectout) 1307 end if 1308 else 1309 !$OMP TARGET DATA USE_DEVICE_PTR(vnl_projections,current_ikpt_projs_r,temp_realvec_r) 1310 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1311 c_loc(current_ikpt_projs_r), npwout, & 1312 c_loc(vnl_projections), nprojs, czero, c_loc(temp_realvec_r), npwout) 1313 !$OMP END TARGET DATA 1314 !$OMP TARGET DATA USE_DEVICE_PTR(vnl_projections,current_ikpt_projs_i,temp_realvec_i) 1315 call abi_gpu_xgemm(cplex, 'N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1316 c_loc(current_ikpt_projs_i), npwout, & 1317 c_loc(vnl_projections), nprojs, czero, c_loc(temp_realvec_i), npwout) 1318 !$OMP END TARGET DATA 1319 1320 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & 1321 !!$OMP TARGET LOOP & 1322 !$OMP& MAP(to:temp_realvec_r,temp_realvec_i,vectout) PRIVATE(i) 1323 do i=1, npwin*nspinor*ndat 1324 vectout(1,i) = temp_realvec_r(i) 1325 vectout(2,i) = temp_realvec_i(i) 1326 end do 1327 end if 1328 end if 1329 end if ! opernlb 1330 1331 ! opernld 1332 if(signs==1) then 1333 #ifdef HAVE_GPU_HIP 1334 !$OMP TARGET UPDATE FROM(vnl_projections,projections_ptr,dprojections) 1335 #endif 1336 if(choice==1.or.choice==3.or.choice==23) then 1337 shift=0 1338 iatm=0 1339 esum=zero 1340 do itypat=1, ntypat 1341 nlmn=count(indlmn(3,:,itypat)>0) 1342 ibeg = shift+1 1343 iend = shift+nattyp(itypat)*nlmn 1344 nattyp_i = nattyp(itypat) 1345 #ifndef HAVE_GPU_HIP 1346 !$OMP TARGET TEAMS DISTRIBUTE & 1347 !$OMP& MAP(to:vnl_projections,projections_ptr,enlk) & 1348 !$OMP& FIRSTPRIVATE(idat,itypat,nlmn,esum) 1349 #endif 1350 do idat=1,ndat*nspinor 1351 esum=zero 1352 #ifndef HAVE_GPU_HIP 1353 !$OMP PARALLEL DO COLLAPSE(3) REDUCTION(+:esum) & 1354 !$OMP& PRIVATE(ia,ilmn,ii) 1355 #endif 1356 do ia=1,nattyp_i 1357 do ilmn=1,nlmn 1358 do ii=1,cplex 1359 esum=esum +vnl_projections(ii,shift+(ia-1)*nlmn+ilmn,idat) & 1360 & *projections_ptr (ii,shift+(ia-1)*nlmn+ilmn,idat) 1361 end do 1362 end do 1363 end do 1364 enlk(idat) = enlk(idat) + esum 1365 end do 1366 shift = shift + nattyp(itypat)*nlmn 1367 iatm = iatm+nattyp(itypat) 1368 end do 1369 if (choice==1) then 1370 #ifndef HAVE_GPU_HIP 1371 !$OMP TARGET PARALLEL DO MAP(to:enlout,enlk) PRIVATE(idat) 1372 #endif 1373 do idat=1,ndat 1374 enlout(idat)=enlk(idat) 1375 end do 1376 end if 1377 end if ! 
choice=1/3/23 1378 if(choice==2.or.choice==3.or.choice==23) then 1379 grad_shift=merge(9,6,choice==23) 1380 if (choice==3.or.choice==23) then 1381 shift=0 1382 iatm=0 1383 do itypat=1, ntypat 1384 nlmn=count(indlmn(3,:,itypat)>0) 1385 ibeg = shift+1 1386 iend = shift+nattyp(itypat)*nlmn 1387 nattyp_i = nattyp(itypat) 1388 1389 #ifndef HAVE_GPU_HIP 1390 !$OMP TARGET TEAMS DISTRIBUTE COLLAPSE(2) & 1391 !$OMP& MAP(to:vnl_projections,dprojections,enlout) & 1392 !$OMP& FIRSTPRIVATE(idat,itypat,nlmn,esum) 1393 #endif 1394 do idat=1,ndat*nspinor 1395 do igrad=1,6 1396 esum=zero 1397 #ifndef HAVE_GPU_HIP 1398 !$OMP PARALLEL DO COLLAPSE(3) REDUCTION(+:esum) & 1399 !$OMP& PRIVATE(ia,ilmn,ii) 1400 #endif 1401 do ia=1,nattyp_i 1402 !Following loops are a [D][Z]DOT 1403 do ilmn=1,nlmn 1404 do ii=1,cplex 1405 esum=esum +vnl_projections(ii,shift+(ia-1)*nlmn+ilmn,idat) & 1406 & *dprojections (ii,grad_shift*shift + (ia-1)*nlmn*grad_shift + (igrad-1)*nlmn +ilmn,idat) 1407 end do 1408 end do 1409 end do 1410 enlout((idat-1)*nnlout+igrad) = enlout((idat-1)*nnlout+igrad) + two*esum 1411 end do 1412 end do 1413 1414 shift = shift + nattyp(itypat)*nlmn 1415 iatm = iatm+nattyp(itypat) 1416 end do 1417 end if 1418 1419 if (choice==2.or.choice==23) then 1420 shift=0 1421 iatm=0 1422 force_shift=merge(6,0,choice==23) 1423 do itypat=1, ntypat 1424 nlmn=count(indlmn(3,:,itypat)>0) 1425 nattyp_i = nattyp(itypat) 1426 1427 #ifndef HAVE_GPU_HIP 1428 !$OMP TARGET TEAMS DISTRIBUTE COLLAPSE(3) & 1429 !$OMP& MAP(to:vnl_projections,dprojections,enlout) & 1430 !$OMP& FIRSTPRIVATE(idat,itypat,nlmn,esum) 1431 #endif 1432 do idat=1,ndat*nspinor 1433 do ia=1,nattyp_i 1434 do igrad=1,3 1435 !Following loops are a [D][Z]DOT 1436 esum=zero 1437 #ifndef HAVE_GPU_HIP 1438 !$OMP PARALLEL DO COLLAPSE(2) REDUCTION(+:esum) & 1439 !$OMP& PRIVATE(ilmn,ii) 1440 #endif 1441 do ilmn=1,nlmn 1442 do ii=1,cplex 1443 esum=esum +vnl_projections(ii,shift+(ia-1)*nlmn+ilmn,idat) & 1444 & *dprojections(ii,grad_shift*shift+(ia-1)*nlmn*grad_shift+(igrad-1+force_shift)*nlmn +ilmn,idat) 1445 end do 1446 end do 1447 enlout((idat-1)*nnlout + force_shift + (iatm+ia-1)*3 + igrad)= & 1448 & enlout((idat-1)*nnlout + force_shift + (iatm+ia-1)*3 + igrad) + two*esum 1449 end do 1450 end do 1451 end do 1452 1453 shift = shift + nattyp(itypat)*nlmn 1454 iatm = iatm+nattyp(itypat) 1455 end do 1456 end if 1457 end if ! choice=2, 3 or 23 1458 #ifndef HAVE_GPU_HIP 1459 !$OMP TARGET UPDATE FROM(enlout,enlk) 1460 #endif 1461 end if !opernld 1462 1463 end if ! choice>0 1464 1465 !$OMP TASKWAIT 1466 ! Reduction in case of parallelism 1467 if (signs==1.and.mpi_enreg%paral_spinor==1) then 1468 if (size(enlout)>0) then 1469 call xmpi_sum(enlout,mpi_enreg%comm_spinor,ierr) 1470 end if 1471 if (choice==3.or.choice==23) then 1472 call xmpi_sum(enlk,mpi_enreg%comm_spinor,ierr) 1473 end if 1474 end if 1475 1476 ! Derivatives wrt strain 1477 ! - Convert from reduced to cartesian coordinates 1478 ! - Substract volume contribution 1479 if ((choice==3.or.choice==23).and.signs==1.and.paw_opt<=3) then 1480 do idat=1,ndat 1481 enlout_shift=(idat-1)*nnlout 1482 call strconv(enlout(enlout_shift+1:enlout_shift+6),gprimd,work) 1483 enlout(enlout_shift+1:enlout_shift+3)=(work(1:3)-enlk(idat)) 1484 enlout(enlout_shift+4:enlout_shift+6)= work(4:6) 1485 end do 1486 end if 1487 1488 ! 
Retrieve and release allocated buffers 1489 !$OMP TARGET EXIT DATA MAP(delete:vectin) IF(transfer_vectin) 1490 !$OMP TARGET EXIT DATA MAP(from:vectout) IF(transfer_vectout) 1491 !$OMP TARGET EXIT DATA MAP(from:svectout) IF(transfer_svectout) 1492 1493 #ifdef HAVE_GPU_CUDA 1494 !$OMP TARGET EXIT DATA MAP(delete:s_projections,vnl_projections) 1495 if(.not. local_vectproj) then 1496 !$OMP TARGET EXIT DATA MAP(delete:projections_ptr) 1497 end if 1498 #endif 1499 1500 ! Release memory 1501 if(signs == 1) then 1502 !$OMP TARGET EXIT DATA MAP(delete:enlout) 1503 end if 1504 if(gemm_nonlop_is_distributed) then 1505 !$OMP TARGET EXIT DATA MAP(delete:projs_recv) 1506 ABI_FREE(projs_recv) 1507 if (signs==1.and.ngrads>0) then 1508 !$OMP TARGET EXIT DATA MAP(delete:dprojs_recv) 1509 ABI_FREE(dprojs_recv) 1510 end if 1511 end if 1512 !$OMP TARGET EXIT DATA MAP(delete:atindx1,indlmn,enl) 1513 if (allocated(dprojections)) then 1514 !$OMP TARGET EXIT DATA MAP(delete:dprojections) 1515 ABI_FREE(dprojections) 1516 end if 1517 if (allocated(enlk)) then 1518 !$OMP TARGET EXIT DATA MAP(delete:enlk,nattyp) 1519 ABI_FREE(enlk) 1520 end if 1521 1522 end subroutine gemm_nonlop_ompgpu 1523 1524 !*** 1525 1526 1527 #else 1528 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1529 !!!!!! stubs for compiling with OpenMP GPU offload disabled. 1530 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1531 1532 subroutine init_gemm_nonlop_ompgpu(nkpt) 1533 integer,intent(in) :: nkpt 1534 1535 ABI_UNUSED((/nkpt/)) 1536 ABI_BUG("Unhandled configuration for OpenMP GPU immplementation") 1537 end subroutine init_gemm_nonlop_ompgpu
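A detail of the routine above worth isolating is how it avoids redundant host-device traffic: vectin, vectout and svectout are mapped only when the caller has not already placed them on the GPU, by combining presence tests (xomp_target_is_present in the actual code) with IF clauses on the TARGET ENTER/EXIT DATA directives. The following is a minimal, self-contained sketch of that pattern, with the presence information passed in as plain logicals and placeholder array names; it is not the module's interface.

subroutine conditional_map_sketch(x, y, caller_mapped_x, caller_mapped_y)
  ! Map data to the GPU only if the caller has not done it already,
  ! and copy back / release only what was mapped here.
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  real(dp), intent(in)    :: x(:,:)
  real(dp), intent(inout) :: y(:,:)
  logical,  intent(in)    :: caller_mapped_x, caller_mapped_y
  logical :: transfer_x, transfer_y
  integer :: i, j, n1, n2

  transfer_x = .not. caller_mapped_x
  transfer_y = .not. caller_mapped_y
  n1 = size(y, 1); n2 = size(y, 2)

  !$omp target enter data map(to: x)    if(transfer_x)
  !$omp target enter data map(alloc: y) if(transfer_y)

  !$omp target teams distribute parallel do collapse(2)
  do j = 1, n2
    do i = 1, n1
      y(i, j) = 2.0_dp * x(i, j)   ! stand-in for the actual nonlocal work
    end do
  end do

  !$omp target exit data map(delete: x) if(transfer_x)
  !$omp target exit data map(from: y)   if(transfer_y)
end subroutine conditional_map_sketch

The same ENTER/EXIT DATA pairs guarded by IF clauses appear near the beginning and end of gemm_nonlop_ompgpu (transfer_vectin, transfer_vectout, transfer_svectout).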
m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernla [ Functions ]
NAME
gemm_nonlop_ompgpu_distributed_gemm_opernla
FUNCTION
Distributed version of the "opernla" GEMM called in gemm_nonlop_ompgpu.
INPUTS
SOURCE
555 subroutine gemm_nonlop_ompgpu_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 556 & nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,beta,& 557 & projs_local,projs_recv,vectin,cprojs) 558 integer, intent(in) :: rank,nprocs,npwin,ndat,nspinor 559 integer, intent(in) :: nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex 560 real(dp), intent(in) :: vectin(*) 561 complex(dpc), intent(in) :: beta 562 real(dp), intent(inout), target :: projs_local(cplex,npwin,nprojs_last_blk),projs_recv(cplex,npwin,nprojs_last_blk) 563 real(dp), intent(out) :: cprojs(:,:,:) 564 565 !Local variables 566 integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next,idat 567 real(dp), ABI_CONTIGUOUS pointer :: recv_buf(:,:,:), work_buf(:,:,:) 568 real(dp), ABI_CONTIGUOUS pointer :: recv_buf_f(:), work_buf_f(:) 569 570 571 ! ************************************************************************* 572 573 rank_next=modulo(rank + 1,nprocs) 574 rank_prev=rank - 1 575 if(rank_prev == -1) rank_prev = nprocs - 1 576 do iblock=1,nprocs 577 578 if(rank+iblock == nprocs) then 579 nprojs_cur_blk = nprojs_last_blk 580 else 581 nprojs_cur_blk = nprojs_blk 582 end if 583 584 if(modulo(iblock,2)==1) then 585 work_buf => projs_local(1:cplex,1:npwin,1:nprojs_last_blk) 586 recv_buf => projs_recv(1:cplex,1:npwin,1:nprojs_last_blk) 587 else 588 work_buf => projs_recv(1:cplex,1:npwin,1:nprojs_last_blk) 589 recv_buf => projs_local(1:cplex,1:npwin,1:nprojs_last_blk) 590 end if 591 592 #ifndef HAVE_GPU_MPI 593 594 ! GPU-aware MPI not available : perform MPI comms on CPU 595 call xmpi_isend(work_buf,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 596 call xmpi_irecv(recv_buf,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 597 !$OMP TARGET UPDATE TO(work_buf) 598 599 #else 600 601 ! GPU-aware MPI available : pass GPU buffers to MPI 602 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 603 call c_f_pointer(c_loc(work_buf), work_buf_f, [cplex*npwin*nprojs_last_blk]) 604 call c_f_pointer(c_loc(recv_buf), recv_buf_f, [cplex*npwin*nprojs_last_blk]) 605 call MPI_ISEND(work_buf_f,cplex*npwin*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 606 & rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 607 call MPI_IRECV(recv_buf_f,cplex*npwin*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 608 & rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 609 !$OMP END TARGET DATA 610 611 #endif 612 613 614 ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk 615 iend = ibeg+nprojs_cur_blk-1 616 if(cplex == 2) then 617 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,vectin,cprojs) 618 call abi_gpu_xgemm(cplex, 'C','N', & 619 nprojs_cur_blk, ndat*nspinor, npwin, cone, & 620 c_loc(work_buf), npwin, & 621 c_loc(vectin), npwin, & 622 beta, & 623 c_loc(cprojs(1,ibeg,1)), nprojs) 624 !$OMP END TARGET DATA 625 else 626 call DGEMM('T', 'N', nprojs_cur_blk, ndat*nspinor, npwin, one, & 627 & work_buf, npwin, & 628 & vectin, npwin, real(beta), cprojs(:,ibeg:iend,:), nprojs_cur_blk) 629 end if 630 631 call xmpi_waitall(req,ierr) 632 633 end do 634 635 if(modulo(iblock,2)==1) then 636 #ifndef HAVE_GPU_MPI 637 call DCOPY(cplex*npwin*nprojs_cur_blk, recv_buf, 1, work_buf, 1) 638 #else 639 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 640 call copy_gpu_to_gpu(c_loc(work_buf), c_loc(recv_buf), INT(cplex, c_size_t)*npwin*nprojs_last_blk*dp) 641 !$OMP END TARGET DATA 642 #endif 643 end if 644 end subroutine gemm_nonlop_ompgpu_distributed_gemm_opernla
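The routine above overlaps communication with computation: at each step a rank launches the block GEMM on the projector block it currently holds, while that block is already being forwarded to the previous rank and the next block is arriving from the following rank. The sketch below reproduces this ring pipeline with host MPI and DGEMM only, assuming equal block sizes (the real routine also handles a larger last block and a GPU-aware MPI path, and avoids the final copy by alternating buffers); every name is a placeholder.

subroutine ring_opernla_sketch(comm, npw, ndat, nblk, nptot, blk_local, psi, proj_all)
  ! Ring exchange: at step k this rank computes with the block owned by rank
  ! modulo(me+k-1,nprocs) while that block is simultaneously sent on to rank me-1.
  use mpi
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  integer, intent(in) :: comm, npw, ndat, nblk, nptot   ! nptot = nprocs*nblk
  real(dp), intent(in)  :: blk_local(npw, nblk)         ! projector block owned by this rank
  real(dp), intent(in)  :: psi(npw, ndat)               ! ndat wavefunctions
  real(dp), intent(out) :: proj_all(nptot, ndat)        ! all projections, filled block by block

  integer :: me, nprocs, ierr, step, owner, ibeg, req(2)
  real(dp), allocatable :: work(:,:), recv_buf(:,:)

  call MPI_Comm_rank(comm, me, ierr)
  call MPI_Comm_size(comm, nprocs, ierr)
  allocate(work(npw, nblk), recv_buf(npw, nblk))
  work = blk_local

  do step = 1, nprocs
    ! Post the exchange for the next step before computing on the current block.
    call MPI_Isend(work,     npw*nblk, MPI_DOUBLE_PRECISION, modulo(me-1, nprocs), step, comm, req(1), ierr)
    call MPI_Irecv(recv_buf, npw*nblk, MPI_DOUBLE_PRECISION, modulo(me+1, nprocs), step, comm, req(2), ierr)

    owner = modulo(me + step - 1, nprocs)   ! rank whose projector block we hold right now
    ibeg  = owner*nblk + 1
    ! "opernla" for one block: proj_all(ibeg:ibeg+nblk-1,:) = work^T * psi
    call dgemm('T', 'N', nblk, ndat, npw, 1.0_dp, work, npw, psi, npw, 0.0_dp, proj_all(ibeg, 1), nptot)

    call MPI_Waitall(2, req, MPI_STATUSES_IGNORE, ierr)
    work = recv_buf
  end do
end subroutine ring_opernla_sketch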
m_gemm_nonlop_ompgpu/gemm_nonlop_ompgpu_distributed_gemm_opernlb [ Functions ]
NAME
gemm_nonlop_ompgpu_distributed_gemm_opernlb
FUNCTION
Distributed version of the "opernlb" GEMM called in gemm_nonlop_ompgpu.
INPUTS
SOURCE
659 subroutine gemm_nonlop_ompgpu_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 660 & nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,& 661 & projs_local,projs_recv,cprojs_s_vnl,vectout) 662 integer, intent(in) :: rank,nprocs,npwout,ndat,nspinor 663 integer, intent(in) :: nprojs,nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex 664 real(dp), intent(in) :: cprojs_s_vnl(:,:,:) 665 real(dp), intent(inout), target :: projs_local(cplex,npwout,nprojs_last_blk),projs_recv(cplex,npwout,nprojs_last_blk) 666 real(dp), intent(out) :: vectout(*) 667 668 !Local variables 669 integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next 670 complex(dpc) :: beta 671 real(dp), ABI_CONTIGUOUS pointer :: recv_buf(:,:,:), work_buf(:,:,:) 672 real(dp), ABI_CONTIGUOUS pointer :: recv_buf_f(:), work_buf_f(:) 673 674 ! ************************************************************************* 675 676 rank_next=modulo(rank + 1,nprocs) 677 rank_prev=rank - 1 678 if(rank_prev == -1) rank_prev = nprocs - 1 679 680 beta = czero 681 682 do iblock=1,nprocs 683 684 if(rank+iblock == nprocs) then 685 nprojs_cur_blk = nprojs_last_blk 686 else 687 nprojs_cur_blk = nprojs_blk 688 end if 689 690 if(modulo(iblock,2)==1) then 691 work_buf => projs_local(1:cplex,1:npwout,1:nprojs_last_blk) 692 recv_buf => projs_recv(1:cplex,1:npwout,1:nprojs_last_blk) 693 else 694 work_buf => projs_recv(1:cplex,1:npwout,1:nprojs_last_blk) 695 recv_buf => projs_local(1:cplex,1:npwout,1:nprojs_last_blk) 696 end if 697 698 #ifndef HAVE_GPU_MPI 699 700 ! GPU-aware MPI not available : perform MPI comms on CPU 701 call xmpi_isend(work_buf,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 702 call xmpi_irecv(recv_buf,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 703 !$OMP TARGET UPDATE TO(work_buf) 704 705 #else 706 707 ! GPU-aware MPI available : pass GPU buffers to MPI 708 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 709 call c_f_pointer(c_loc(work_buf), work_buf_f, [cplex*npwout*nprojs_last_blk]) 710 call c_f_pointer(c_loc(recv_buf), recv_buf_f, [cplex*npwout*nprojs_last_blk]) 711 call MPI_ISEND(work_buf_f,cplex*npwout*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 712 & rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 713 call MPI_IRECV(recv_buf_f,cplex*npwout*nprojs_cur_blk,MPI_DOUBLE_PRECISION,& 714 & rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 715 !$OMP END TARGET DATA 716 717 #endif 718 719 ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk 720 iend = ibeg+nprojs_cur_blk-1 721 if(cplex==2) then 722 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,vectout,cprojs_s_vnl) 723 call abi_gpu_xgemm(cplex, 'N','N', & 724 npwout, ndat*nspinor, nprojs_cur_blk, cone, & 725 c_loc(work_buf), npwout, & 726 c_loc(cprojs_s_vnl(1,ibeg,1)), nprojs, & 727 beta, & 728 c_loc(vectout), npwout) 729 !$OMP END TARGET DATA 730 else 731 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs_cur_blk, one, & 732 & work_buf, npwout, & 733 & cprojs_s_vnl(:,ibeg:iend,:), nprojs_cur_blk, real(beta), vectout, npwout) 734 end if 735 736 beta = cone 737 738 call xmpi_waitall(req,ierr) 739 740 end do 741 742 if(modulo(iblock,2)==1) then 743 #ifndef HAVE_GPU_MPI 744 call DCOPY(cplex*npwout*nprojs_cur_blk, recv_buf, 1, work_buf, 1) 745 #else 746 !$OMP TARGET DATA USE_DEVICE_PTR(work_buf,recv_buf) 747 call copy_gpu_to_gpu(c_loc(work_buf), c_loc(recv_buf), INT(cplex, c_size_t)*npwout*nprojs_last_blk*dp) 748 !$OMP END TARGET DATA 749 #endif 750 end if 751 752 end subroutine gemm_nonlop_ompgpu_distributed_gemm_opernlb
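The opernlb counterpart runs the same ring, but its per-block GEMMs accumulate into the output wavefunctions: the first block overwrites vectout (beta = czero) and every later block adds to it (beta = cone, set at the end of the loop above). The self-contained sketch below keeps only that accumulation pattern, with all projector blocks available locally and placeholder names; in the distributed routine the blocks arrive through the ring exchange shown for opernla.

subroutine blockwise_opernlb_sketch(npw, ndat, nblk, nblocks, projs_blocks, cprojs, vectout)
  ! vectout = sum over blocks of P_block * c_block, built with one DGEMM per block.
  ! First block: beta = 0 (overwrite); later blocks: beta = 1 (accumulate).
  implicit none
  integer, parameter :: dp = kind(1.0d0)
  integer, intent(in) :: npw, ndat, nblk, nblocks
  real(dp), intent(in)  :: projs_blocks(npw, nblk, nblocks)   ! projector blocks
  real(dp), intent(in)  :: cprojs(nblk*nblocks, ndat)         ! (Vnl or S) applied to the projections
  real(dp), intent(out) :: vectout(npw, ndat)
  integer :: ib, ibeg
  real(dp) :: beta

  beta = 0.0_dp
  do ib = 1, nblocks
    ibeg = (ib - 1)*nblk + 1
    call dgemm('N', 'N', npw, ndat, nblk, 1.0_dp, projs_blocks(1, 1, ib), npw, &
&              cprojs(ibeg, 1), nblk*nblocks, beta, vectout, npw)
    beta = 1.0_dp   ! from the second block on, add to what is already in vectout
  end do
end subroutine blockwise_opernlb_sketch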
m_gemm_nonlop_ompgpu/init_gemm_nonlop_ompgpu [ Functions ]
NAME
init_gemm_nonlop_ompgpu
FUNCTION
Initialization of the gemm_nonlop_kpt array
INPUTS
nkpt= number of k-points
SOURCE
398 subroutine init_gemm_nonlop_ompgpu(nkpt) 399 400 integer,intent(in) :: nkpt 401 integer :: ikpt 402 403 ! ************************************************************************* 404 405 ! TODO only allocate the number of kpt treated by this proc 406 ABI_MALLOC(gemm_nonlop_kpt, (nkpt)) 407 do ikpt=1,nkpt 408 gemm_nonlop_kpt(ikpt)%nprojs = -1 409 gemm_nonlop_kpt(ikpt)%ngrads = -1 410 end do 411 mod__nkpt=nkpt 412 413 end subroutine init_gemm_nonlop_ompgpu
m_gemm_nonlop_ompgpu/make_gemm_nonlop_ompgpu [ Functions ]
NAME
make_gemm_nonlop_ompgpu
FUNCTION
Build the gemm_nonlop_kpt array for one k-point and upload the corresponding projector data to the GPU
INPUTS
SOURCE
514 subroutine make_gemm_nonlop_ompgpu(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 515 & ph3d_k,kpt_k,kg_k,kpg_k, & 516 & compute_grad_strain,compute_grad_atom) ! Optional parameters 517 518 integer, intent(in) :: ikpt 519 integer, intent(in) :: npw, lmnmax,ntypat 520 integer, intent(in) :: indlmn(:,:,:), kg_k(:,:) 521 integer, intent(in) :: nattyp(ntypat) 522 integer, intent(in) :: istwf_k 523 logical, intent(in), optional :: compute_grad_strain,compute_grad_atom 524 real(dp), intent(in) :: ucvol 525 real(dp), intent(in) :: ffnl_k(:,:,:,:) 526 real(dp), intent(in) :: ph3d_k(:,:,:) 527 real(dp), intent(in) :: kpt_k(:) 528 real(dp), intent(in), target :: kpg_k(:,:) 529 530 integer :: cplex 531 ! ************************************************************************* 532 533 call free_gemm_nonlop_kpt_ompgpu() 534 call make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 535 & ph3d_k,kpt_k,kg_k,kpg_k, & 536 & compute_grad_strain=compute_grad_strain,compute_grad_atom=compute_grad_atom) 537 cplex=2;if (istwf_k>1) cplex=1 538 call alloc_gemm_nonlop_kpt_ompgpu(ikpt,cplex) 539 540 end subroutine make_gemm_nonlop_ompgpu