TABLE OF CONTENTS
- ABINIT/m_invovl
- m_invovl/apply_block
- m_invovl/apply_block_ompgpu
- m_invovl/apply_invovl
- m_invovl/apply_invovl_ompgpu
- m_invovl/destroy_invovl
- m_invovl/destroy_invovl_ikpt
- m_invovl/init_invovl
- m_invovl/invovl_kpt_type
- m_invovl/make_invovl
- m_invovl/make_invovl_kpt_gpu
- m_invovl/solve_inner
- m_invovl/solve_inner_ompgpu
ABINIT/m_invovl [ Modules ]
NAME
m_invovl
FUNCTION
Provides functions to invert the overlap matrix S. Used by Chebyshev in PAW See paper by A. Levitt and M. Torrent for details S = 1 + projs * s_projs * projs' S^-1 = 1 + projs * inv_s_projs * projs', with inv_s_projs = - (s_projs^-1 + projs'*projs)^-1
COPYRIGHT
Copyright (C) 2013-2024 ABINIT group (AL) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
20 #if defined HAVE_CONFIG_H 21 #include "config.h" 22 #endif 23 24 #include "abi_common.h" 25 26 ! nvtx related macro definition 27 #include "nvtx_macros.h" 28 29 MODULE m_invovl 30 31 use defs_basis 32 use m_errors 33 use m_xmpi 34 use m_xomp 35 use m_abicore 36 use m_abi_linalg 37 38 use defs_abitypes, only : mpi_type 39 use m_time, only : timab 40 use m_hamiltonian, only : gs_hamiltonian_type 41 use m_bandfft_kpt, only : bandfft_kpt_get_ikpt 42 use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby 43 use m_gemm_nonlop, only : gemm_nonlop_use_gemm 44 use m_nonlop, only : nonlop 45 use m_prep_kgb, only : prep_nonlop 46 47 #ifdef HAVE_FC_ISO_C_BINDING 48 use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_int64_t, c_float, c_double, c_size_t, c_loc 49 #endif 50 51 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) 52 use m_nvtx_data 53 #endif 54 55 #ifdef HAVE_GPU 56 use m_gpu_toolbox 57 #endif 58 59 #ifdef HAVE_KOKKOS 60 use m_manage_kokkos, only : add_array_kokkos 61 #endif 62 63 implicit none 64 65 private 66 67 !public procedures. 68 public :: init_invovl 69 public :: make_invovl 70 public :: apply_invovl 71 public :: destroy_invovl 72 73 ! Those routines are here to assess memory requirements 74 public :: invovl_ompgpu_work_mem 75 public :: invovl_ompgpu_static_mem
m_invovl/apply_block [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
apply_block
FUNCTION
Helper function: applies a block-diagonal matrix mat(lmnmax, lmnmax, ntypat)
INPUTS
SOURCE
1165 subroutine apply_block(ham, cplx, mat, nprojs, ndat, x, y, block_sliced) 1166 1167 use m_abi_linalg 1168 implicit none 1169 1170 integer,intent(in) :: ndat, nprojs, cplx 1171 real(dp), intent(inout) :: x(cplx, nprojs, ndat), y(cplx, nprojs, ndat) 1172 type(gs_hamiltonian_type),intent(in) :: ham 1173 real(dp), intent(in) :: mat(cplx, ham%lmnmax, ham%lmnmax, ham%ntypat) 1174 integer, intent(in) :: block_sliced 1175 1176 integer :: nlmn, shift, itypat, idat 1177 1178 ! ************************************************************************* 1179 1180 if (block_sliced == 1) then 1181 1182 do idat = 1, ndat 1183 shift = 1 1184 do itypat=1, ham%ntypat 1185 nlmn = count(ham%indlmn(3,:,itypat)>0) 1186 !! apply mat to all atoms at once 1187 ! perform natom multiplications of size nlmn 1188 ! compute y = mat*x 1189 if(cplx == 2) then 1190 call ZHEMM('L','U', nlmn, ham%nattyp(itypat), cone, & 1191 & mat(:, :, :, itypat), ham%lmnmax, & 1192 & x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, czero, & 1193 & y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn) 1194 else 1195 call DSYMM('L','U', nlmn, ham%nattyp(itypat), one, & 1196 & mat(:, :, :, itypat), ham%lmnmax, & 1197 & x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, zero, & 1198 & y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn) 1199 end if 1200 shift = shift + nlmn*ham%nattyp(itypat) 1201 end do 1202 end do 1203 1204 else ! block_sliced = 0 1205 1206 shift = 1 1207 do itypat=1, ham%ntypat 1208 nlmn = count(ham%indlmn(3,:,itypat)>0) 1209 !! apply mat to all atoms at once, all idat at once 1210 ! perform natom multiplications of size nlmn 1211 ! be careful here matrix extracted from x and y are not memory contiguous 1212 ! ==> so in the GPU version we will need to adapt leading dimension 1213 if(cplx == 2) then 1214 call ZHEMM('L','U', nlmn, ham%nattyp(itypat)*ndat, cone, & 1215 & mat(:, :, :, itypat), ham%lmnmax, & 1216 & x(:, 1:nlmn*ham%nattyp(itypat), 1:ndat), nlmn, czero, & 1217 & y(:, 1:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn) 1218 else 1219 call DSYMM('L','U', nlmn, ham%nattyp(itypat)*ndat, one, & 1220 & mat(:, :, :, itypat), ham%lmnmax, & 1221 & x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn, zero, & 1222 & y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn) 1223 end if 1224 shift = shift + nlmn*ham%nattyp(itypat) 1225 end do 1226 1227 end if 1228 1229 end subroutine apply_block
m_invovl/apply_block_ompgpu [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
apply_block_ompgpu
FUNCTION
Helper function: applies a block-diagonal matrix mat(lmnmax, lmnmax, ntypat)
INPUTS
SOURCE
1634 subroutine apply_block_ompgpu(ham, cplx, mat, nprojs, ndat, x, y, block_sliced) 1635 1636 use m_abi_linalg 1637 implicit none 1638 1639 integer,intent(in) :: ndat, nprojs, cplx 1640 real(dp), intent(inout), target :: x(cplx, nprojs, ndat), y(cplx, nprojs, ndat) 1641 type(gs_hamiltonian_type),intent(in) :: ham 1642 real(dp), intent(in), target :: mat(cplx, ham%lmnmax, ham%lmnmax, ham%ntypat) 1643 integer, intent(in) :: block_sliced 1644 1645 integer :: nlmn, shift, itypat, idat 1646 real(dp), ABI_CONTIGUOUS pointer :: x_ptr(:, :, :), y_ptr(:, :, :), mat_ptr(:,:,:) 1647 1648 ! ************************************************************************* 1649 1650 if (block_sliced == 1) then 1651 1652 do idat = 1, ndat 1653 shift = 1 1654 do itypat=1, ham%ntypat 1655 nlmn = count(ham%indlmn(3,:,itypat)>0) 1656 !! apply mat to all atoms at once 1657 ! perform natom multiplications of size nlmn 1658 ! compute y = mat*x 1659 if(cplx == 2) then 1660 !$OMP TARGET DATA USE_DEVICE_PTR(mat,x,y) 1661 call abi_gpu_zhemm('L','U', nlmn, ham%nattyp(itypat), cone, & 1662 c_loc(mat(:, :, :, itypat)), ham%lmnmax, & 1663 c_loc(x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn, czero, & 1664 c_loc(y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn) 1665 !$OMP END TARGET DATA 1666 else 1667 !$OMP TARGET DATA USE_DEVICE_PTR(mat,x,y) 1668 call abi_gpu_xsymm(cplx, 'L','U', nlmn, ham%nattyp(itypat), cone, & 1669 c_loc(mat(:, :, :, itypat)), ham%lmnmax, & 1670 c_loc(x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn, czero, & 1671 c_loc(y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn) 1672 !$OMP END TARGET DATA 1673 end if 1674 shift = shift + nlmn*ham%nattyp(itypat) 1675 end do 1676 end do 1677 1678 else ! block_sliced = 0 1679 1680 shift = 1 1681 do itypat=1, ham%ntypat 1682 nlmn = count(ham%indlmn(3,:,itypat)>0) 1683 x_ptr => x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, :) 1684 y_ptr => y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, :) 1685 mat_ptr => mat(:, :, :, itypat) 1686 !! apply mat to all atoms at once, all idat at once 1687 ! perform natom multiplications of size nlmn 1688 ! be careful here matrix extracted from x and y are not memory contiguous 1689 ! ==> so in the GPU version we will need to adapt leading dimension 1690 !$OMP TARGET DATA USE_DEVICE_PTR(mat_ptr,x_ptr,y_ptr) 1691 call abi_gpu_xgemm_strided(cplx, 'N','N', & 1692 nlmn, ham%nattyp(itypat), nlmn, cone, & 1693 c_loc(mat_ptr), ham%lmnmax, 0, & 1694 c_loc(x_ptr), nlmn, nprojs, & 1695 czero, & 1696 c_loc(y_ptr), nlmn, nprojs, ndat) 1697 !$OMP END TARGET DATA 1698 shift = shift + nlmn*ham%nattyp(itypat) 1699 end do 1700 1701 end if 1702 1703 end subroutine apply_block_ompgpu
m_invovl/apply_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
apply_invovl
FUNCTION
Applies the inverse of the overlap matrix to cwavef
INPUTS
SOURCE
787 subroutine apply_invovl(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced) 788 789 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 790 use, intrinsic :: iso_c_binding 791 #endif 792 793 implicit none 794 795 ! args 796 type(gs_hamiltonian_type), intent(in), target :: ham 797 integer, intent(in) :: npw, ndat 798 integer, intent(in) :: nspinor 799 integer, intent(in) :: block_sliced 800 real(dp), intent(inout), target :: cwavef(2, npw*nspinor*ndat) ! TODO should be in, fix nonlop 801 type(mpi_type) :: mpi_enreg 802 real(dp), intent(inout), target :: sm1cwavef(2, npw*nspinor*ndat) 803 type(pawcprj_type), intent(inout) :: cwaveprj(:,:) 804 805 real(dp),allocatable, target :: proj(:,:,:), sm1proj(:,:,:), PtPsm1proj(:,:,:) 806 807 ! used to pass proj dimensions to cuda 808 integer(kind=c_int32_t) :: proj_dim(3) 809 integer(kind=c_int32_t) :: nattyp_dim 810 integer(kind=c_int32_t) :: indlmn_dim(3) 811 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL) 812 integer(kind=c_int32_t) :: cwavef_size 813 #endif 814 815 integer :: idat, iatom, nlmn, shift 816 real(dp) :: tsec(2) 817 818 integer :: choice, cpopt, paw_opt , cplx 819 character :: blas_transpose 820 type(pawcprj_type),allocatable :: cwaveprj_in(:,:) 821 822 integer :: ikpt_this_proc 823 ! dummies 824 real(dp) :: enlout(ndat), lambda_block(1), gvnlxc(1,1) 825 integer, parameter :: nnlout = 0, idir = 0, signs = 2 826 827 type(invovl_kpt_type), pointer :: invovl 828 829 ! ************************************************************************* 830 831 if(ham%gpu_option==ABI_GPU_OPENMP) then 832 #ifdef HAVE_OPENMP_OFFLOAD 833 call apply_invovl_ompgpu(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced) 834 return 835 #endif 836 end if 837 838 ABI_NVTX_START_RANGE(NVTX_INVOVL_PREP) 839 ikpt_this_proc=bandfft_kpt_get_ikpt() 840 invovl => invovl_kpt(ikpt_this_proc) 841 842 if(ham%istwf_k == 1) then 843 cplx = 2 844 blas_transpose = 'c' 845 else 846 cplx = 1 847 blas_transpose = 't' 848 end if 849 850 ABI_MALLOC(proj, (cplx,invovl%nprojs,nspinor*ndat)) 851 ABI_MALLOC(sm1proj, (cplx,invovl%nprojs,nspinor*ndat)) 852 ABI_MALLOC(PtPsm1proj, (cplx,invovl%nprojs,nspinor*ndat)) 853 proj = zero 854 sm1proj = zero 855 PtPsm1proj = zero 856 857 proj_dim = (/ size(proj,1), size(proj,2), size(proj,3) /) 858 859 nattyp_dim = size(ham%nattyp) 860 861 indlmn_dim = (/ size(ham%indlmn,1), size(ham%indlmn,2), size(ham%indlmn,3) /) 862 863 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 864 865 !! memory allocation of data used in solve_inner_gpu 866 !! note : this is actually done only once 867 if (ham%gpu_option==ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then 868 869 #ifdef DEBUG_VERBOSE_GPU 870 if(xmpi_comm_rank(xmpi_world) == 0) then 871 call check_gpu_mem("gpu_apply_invovl_inner_alloc begin") 872 end if 873 #endif 874 875 ! make sure to use sizes from apply_invovl 876 call f_gpu_apply_invovl_inner_alloc(proj_dim, ham%ntypat, 0) 877 878 #ifdef DEBUG_VERBOSE_GPU 879 if(xmpi_comm_rank(xmpi_world) == 0) then 880 call check_gpu_mem("gpu_apply_invovl_inner_alloc end") 881 end if 882 #endif 883 884 ! TODO find a better place to put that initialization 885 call f_gpu_init_invovl_data(indlmn_dim, c_loc(ham%indlmn(1,1,1))) 886 887 end if 888 889 #endif 890 891 892 call timab(timer_apply_inv_ovl_opernla, 1, tsec) 893 894 ! cwaveprj may be dummy or unused if gemm nonlop is turned on 895 if((.not. gemm_nonlop_use_gemm) .or. size(cwaveprj) > 1) then 896 ABI_MALLOC(cwaveprj_in, (ham%natom,nspinor*ndat)) 897 call pawcprj_alloc(cwaveprj_in,0,ham%dimcprj) 898 else 899 ABI_MALLOC(cwaveprj_in, (1,1)) 900 call pawcprj_alloc(cwaveprj_in,0,(/1/)) 901 end if 902 ABI_NVTX_END_RANGE() 903 904 ! get the cprj 905 ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP1) 906 choice = 0 ! only compute cprj, nothing else 907 cpopt = 0 ! compute and save cprj 908 paw_opt = 3 ! S nonlocal operator 909 if (mpi_enreg%paral_kgb==1) then 910 call prep_nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,ndat,mpi_enreg,& 911 & nnlout,paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,& 912 & already_transposed=.true.,& 913 & gpu_option=ham%gpu_option,& 914 & vectproj=proj) 915 else 916 call nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,mpi_enreg,ndat,& 917 & nnlout,paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=proj) 918 end if 919 ABI_NVTX_END_RANGE() 920 921 call timab(timer_apply_inv_ovl_opernla, 2, tsec) 922 call timab(timer_apply_inv_ovl_inv_s, 1, tsec) 923 924 ! If using GEMM nonlop, proj array is used directly instead of writing in cwaveprj_in content, so skip this copy 925 if(.not. gemm_nonlop_use_gemm) then 926 ! copy cwaveprj_in to proj(:,:) 927 do idat=1, ndat*nspinor 928 shift = 0 929 do iatom = 1, ham%natom 930 nlmn = cwaveprj_in(iatom, idat)%nlmn 931 proj(1:cplx, shift+1:shift+nlmn, idat) = cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn) 932 shift = shift + nlmn 933 end do 934 end do 935 end if 936 937 !multiply by S^1 938 ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER) 939 ! TODO : when solve_inner_gpu is ready, update the following to activate GPU computation 940 if (ham%gpu_option == ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then 941 942 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 943 944 if (mpi_enreg%nproc_fft /= 1) then 945 ABI_ERROR("[66_wfs/m_invovl.F90:apply_invovl] nproc_fft must be 1, when GPU/CUDA is activated") 946 end if 947 948 call f_solve_inner_gpu(proj_dim, c_loc(proj(1,1,1)), & 949 & c_loc(sm1proj(1,1,1)), c_loc(PtPsm1proj(1,1,1)), & 950 & nattyp_dim, c_loc(ham%nattyp(1)), ham%ntypat, & 951 & ham%lmnmax, cplx, block_sliced) 952 953 #endif 954 955 else 956 957 call solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat*nspinor, sm1proj, PtPsm1proj, block_sliced) 958 sm1proj = - sm1proj 959 PtPsm1proj = - PtPsm1proj 960 end if 961 962 ABI_NVTX_END_RANGE() 963 964 ! If using GEMM nonlop, sm1proj array is used directly instead of reading cwaveprj content, so skip this copy 965 if(.not. gemm_nonlop_use_gemm) then 966 ! copy sm1proj to cwaveprj(:,:) 967 do idat=1, ndat*nspinor 968 shift = 0 969 do iatom = 1, ham%natom 970 nlmn = cwaveprj(iatom, idat)%nlmn 971 cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = sm1proj(1:cplx, shift+1:shift+nlmn, idat) 972 shift = shift + nlmn 973 end do 974 end do 975 end if 976 call timab(timer_apply_inv_ovl_inv_s, 2, tsec) 977 call timab(timer_apply_inv_ovl_opernlb, 1, tsec) 978 979 ! get the corresponding wf 980 ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP2) 981 cpopt = 2 ! reuse cprj 982 choice = 7 ! get wf from cprj, without the application of S 983 paw_opt = 3 984 if (mpi_enreg%paral_kgb==1) then 985 call prep_nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,ndat,mpi_enreg,nnlout,& 986 & paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,already_transposed=.true.,& 987 & gpu_option=ham%gpu_option,vectproj=sm1proj) 988 else 989 call nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,paw_opt,& 990 & signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=sm1proj) 991 end if 992 ABI_NVTX_END_RANGE() 993 994 call timab(timer_apply_inv_ovl_opernlb, 2, tsec) 995 996 ABI_NVTX_START_RANGE(NVTX_INVOVL_POST2) 997 if(size(cwaveprj) > 1) then 998 ! copy PtPSm1proj to cwaveprj(:,:) 999 do idat=1, ndat*nspinor 1000 shift = 0 1001 do iatom = 1, ham%natom 1002 nlmn = cwaveprj(iatom, idat)%nlmn 1003 cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = PtPsm1proj(1:cplx, shift+1:shift+nlmn, idat) 1004 shift = shift + nlmn 1005 end do 1006 end do 1007 !cwaveprj_in is empty if GEMM nonlop is being used, so populate it here 1008 if(gemm_nonlop_use_gemm) then 1009 do idat=1, ndat*nspinor 1010 shift = 0 1011 do iatom = 1, ham%natom 1012 nlmn = cwaveprj_in(iatom, idat)%nlmn 1013 cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn) = proj(1:cplx, shift+1:shift+nlmn, idat) 1014 shift = shift + nlmn 1015 end do 1016 end do 1017 end if 1018 call pawcprj_axpby(one, one, cwaveprj_in, cwaveprj) 1019 end if 1020 call pawcprj_free(cwaveprj_in) 1021 ABI_FREE(cwaveprj_in) 1022 1023 if (ham%gpu_option == ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then 1024 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL) 1025 cwavef_size = 2*npw*nspinor*ndat 1026 call add_array_kokkos(c_loc(sm1cwavef), c_loc(cwavef), cwavef_size) 1027 #endif 1028 else 1029 sm1cwavef = cwavef + sm1cwavef 1030 end if 1031 1032 ABI_NVTX_END_RANGE() 1033 1034 ABI_FREE(proj) 1035 ABI_FREE(sm1proj) 1036 ABI_FREE(PtPsm1proj) 1037 1038 end subroutine apply_invovl
m_invovl/apply_invovl_ompgpu [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
apply_invovl_ompgpu
FUNCTION
Applies the inverse of the overlap matrix to cwavef (OpenMP GPU implementation)
INPUTS
SOURCE
1293 subroutine apply_invovl_ompgpu(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced) 1294 1295 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU) 1296 use, intrinsic :: iso_c_binding 1297 #endif 1298 1299 implicit none 1300 1301 ! args 1302 type(gs_hamiltonian_type), intent(in), target :: ham 1303 integer, intent(in) :: npw, ndat 1304 integer, intent(in) :: nspinor 1305 integer, intent(in) :: block_sliced 1306 real(dp), intent(inout), target :: cwavef(2, npw*nspinor*ndat) ! TODO should be in, fix nonlop 1307 type(mpi_type) :: mpi_enreg 1308 real(dp), intent(inout), target :: sm1cwavef(2, npw*nspinor*ndat) 1309 type(pawcprj_type), intent(inout) :: cwaveprj(:,:) 1310 logical :: transfer_omp_args 1311 1312 real(dp), ABI_CONTIGUOUS pointer :: proj(:,:,:),sm1proj(:,:,:),PtPsm1proj(:,:,:) 1313 1314 integer :: idat, iatom, icplx, iproj, nprojs, nlmn, shift 1315 real(dp) :: tsec(2) 1316 1317 integer :: choice, cpopt, paw_opt , cplx, old_me_g0 1318 type(pawcprj_type),allocatable :: cwaveprj_in(:,:) 1319 1320 integer :: ikpt_this_proc 1321 ! dummies 1322 real(dp) :: enlout(ndat), lambda_block(1), gvnlxc(1,1) 1323 integer, parameter :: nnlout = 0, idir = 0, signs = 2 1324 1325 type(invovl_kpt_type), pointer :: invovl 1326 1327 ! ************************************************************************* 1328 1329 ikpt_this_proc=bandfft_kpt_get_ikpt() 1330 invovl => invovl_kpt(ikpt_this_proc) 1331 nprojs=invovl%nprojs 1332 if(ikpt_this_proc /= current_ikpt_in_gpu) call refresh_invovl_ompgpu_kpt(ikpt_this_proc) 1333 1334 if(ham%istwf_k == 1) then 1335 cplx = 2 1336 else 1337 cplx = 1 1338 end if 1339 if(gpu_initialized == 0) call alloc_ompgpu_buffers(cplx,nprojs,nspinor,ndat) 1340 proj => proj_ompgpu 1341 sm1proj => sm1proj_ompgpu 1342 PtPsm1proj => PtPsm1proj_ompgpu 1343 !$OMP TARGET ENTER DATA MAP(alloc:proj,sm1proj,PtPsm1proj) 1344 1345 transfer_omp_args = .not. ( xomp_target_is_present(c_loc(sm1cwavef)) & 1346 .and. xomp_target_is_present(c_loc(cwavef))) 1347 !$OMP TARGET ENTER DATA MAP(alloc:gvnlxc) 1348 if(transfer_omp_args) then 1349 !$OMP TARGET ENTER DATA MAP(alloc:sm1cwavef,cwavef) 1350 !$OMP TARGET UPDATE TO(sm1cwavef,cwavef) 1351 end if 1352 1353 call timab(timer_apply_inv_ovl_opernla, 1, tsec) 1354 1355 ! get the cprj 1356 ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP1) 1357 choice = 0 ! only compute cprj, nothing else 1358 cpopt = 0 ! compute and save cprj 1359 paw_opt = 3 ! S nonlocal operator 1360 1361 if(ham%istwf_k==2) then 1362 old_me_g0=mpi_enreg%me_g0 1363 if (mpi_enreg%me_fft==0) then 1364 mpi_enreg%me_g0=1 1365 else 1366 mpi_enreg%me_g0=0 1367 end if 1368 end if 1369 call nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,& 1370 paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=proj) 1371 ABI_NVTX_END_RANGE() 1372 1373 call timab(timer_apply_inv_ovl_opernla, 2, tsec) 1374 call timab(timer_apply_inv_ovl_inv_s, 1, tsec) 1375 1376 !multiply by S^1 1377 ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER) 1378 call solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat*nspinor, sm1proj, PtPsm1proj, block_sliced) 1379 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) MAP(to:sm1proj,PtPsm1proj) 1380 do idat =1, ndat*nspinor 1381 do iproj = 1, nprojs 1382 do icplx = 1, cplx 1383 sm1proj(icplx,iproj,idat) = - sm1proj(icplx,iproj,idat) 1384 PtPsm1proj(icplx,iproj,idat) = - PtPsm1proj(icplx,iproj,idat) 1385 end do 1386 end do 1387 end do 1388 1389 ABI_NVTX_END_RANGE() 1390 1391 call timab(timer_apply_inv_ovl_inv_s, 2, tsec) 1392 call timab(timer_apply_inv_ovl_opernlb, 1, tsec) 1393 1394 ! get the corresponding wf 1395 ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP2) 1396 cpopt = 2 ! reuse cprj 1397 choice = 7 ! get wf from cprj, without the application of S 1398 paw_opt = 3 1399 call nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,paw_opt,& 1400 signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=sm1proj) 1401 ABI_NVTX_END_RANGE() 1402 1403 call timab(timer_apply_inv_ovl_opernlb, 2, tsec) 1404 if (ham%istwf_k==2) mpi_enreg%me_g0=old_me_g0 1405 1406 if(size(cwaveprj) > 1) then 1407 ABI_MALLOC(cwaveprj_in, (ham%natom,nspinor*ndat)) 1408 call pawcprj_alloc(cwaveprj_in,0,ham%dimcprj) 1409 !$OMP TARGET UPDATE FROM(PtPsm1proj,proj) 1410 ! copy PtPsm1proj to cwaveprj(:,:) 1411 do idat=1, ndat*nspinor 1412 shift = 0 1413 do iatom = 1, ham%natom 1414 nlmn = cwaveprj(iatom, idat)%nlmn 1415 cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = PtPsm1proj(1:cplx, shift+1:shift+nlmn, idat) 1416 shift = shift + nlmn 1417 end do 1418 end do 1419 do idat=1, ndat*nspinor 1420 shift = 0 1421 do iatom = 1, ham%natom 1422 nlmn = cwaveprj_in(iatom, idat)%nlmn 1423 cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn) = proj(1:cplx, shift+1:shift+nlmn, idat) 1424 shift = shift + nlmn 1425 end do 1426 end do 1427 call pawcprj_axpby(one, one, cwaveprj_in, cwaveprj) 1428 call pawcprj_free(cwaveprj_in) 1429 ABI_FREE(cwaveprj_in) 1430 end if 1431 1432 call abi_gpu_xaxpy(1, 2*npw*nspinor*ndat, cone, cwavef, 1, sm1cwavef, 1) 1433 1434 if(transfer_omp_args) then 1435 !$OMP TARGET UPDATE FROM(sm1cwavef,cwavef) 1436 !$OMP TARGET EXIT DATA MAP(delete:sm1cwavef,cwavef) 1437 end if 1438 1439 !$OMP TARGET EXIT DATA MAP(delete:gvnlxc) 1440 !$OMP TARGET EXIT DATA MAP(delete:proj,sm1proj,PtPsm1proj) 1441 1442 end subroutine apply_invovl_ompgpu
m_invovl/destroy_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
destroy_invovl
FUNCTION
Destruction of the invovl_kpt array
INPUTS
nkpt= number of k-points
SOURCE
442 subroutine destroy_invovl(nkpt, gpu_option) 443 444 integer, intent(in) :: nkpt 445 integer, intent(in) :: gpu_option 446 integer :: ikpt 447 448 ! ************************************************************************* 449 450 ! TODO add cycling if kpt parallelism 451 do ikpt=1,nkpt 452 if(invovl_kpt(ikpt)%nprojs == -1) then 453 ! write(0, *) 'ERROR invovl_kpt is unallocated' 454 cycle 455 end if 456 call destroy_invovl_ikpt(ikpt, gpu_option) 457 end do 458 459 ABI_FREE(invovl_kpt) 460 461 end subroutine destroy_invovl
m_invovl/destroy_invovl_ikpt [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
destroy_invovl_ikpt
FUNCTION
Destruction of the ikpt-th member of invovl array
INPUTS
ikpt= index of k-point
SOURCE
389 subroutine destroy_invovl_ikpt(ikpt, gpu_option) 390 391 integer, intent(in) :: ikpt 392 integer, intent(in) :: gpu_option 393 394 ! ************************************************************************* 395 396 if(gpu_option==ABI_GPU_OPENMP) then 397 #ifdef HAVE_OPENMP_OFFLOAD 398 if(gpu_initialized==1 .and. current_ikpt_in_gpu == ikpt) then 399 !$OMP TARGET EXIT DATA MAP(delete:current_gram_projs) 400 !$OMP TARGET EXIT DATA MAP(delete:current_inv_sij) 401 !$OMP TARGET EXIT DATA MAP(delete:current_inv_s_approx) 402 nullify(current_gram_projs) 403 nullify(current_inv_sij) 404 nullify(current_inv_s_approx) 405 current_ikpt_in_gpu = -1 406 !FIXME Smater buffer management ? 407 !!$OMP TARGET EXIT DATA MAP(delete:proj_ompgpu,sm1proj_ompgpu,PtPsm1proj_ompgpu) 408 ABI_FREE(proj_ompgpu) 409 ABI_FREE(sm1proj_ompgpu) 410 ABI_FREE(PtPsm1proj_ompgpu) 411 gpu_initialized = 0 412 end if 413 #endif 414 end if 415 416 ABI_FREE(invovl_kpt(ikpt)%gram_projs) 417 ABI_FREE(invovl_kpt(ikpt)%inv_sij) 418 ABI_FREE(invovl_kpt(ikpt)%inv_s_approx) 419 invovl_kpt(ikpt)%nprojs = -1 420 421 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 422 if (gpu_option == ABI_GPU_LEGACY .or. gpu_option == ABI_GPU_KOKKOS) then 423 call f_gpu_apply_invovl_inner_dealloc() 424 call f_gpu_apply_invovl_matrix_dealloc() 425 end if 426 #endif 427 428 end subroutine destroy_invovl_ikpt
m_invovl/init_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
init_invovl
FUNCTION
Initalization of the invovl_kpt array
INPUTS
nkpt= number of k-points
SOURCE
362 subroutine init_invovl(nkpt) 363 364 integer, intent(in) :: nkpt 365 integer :: ikpt 366 367 ! ************************************************************************* 368 369 ABI_MALLOC(invovl_kpt, (nkpt)) 370 ! TODO add cycling if kpt parallelism 371 do ikpt=1,nkpt 372 invovl_kpt(ikpt)%nprojs = -1 373 end do 374 375 end subroutine init_invovl
m_invovl/invovl_kpt_type [ Types ]
[ Top ] [ m_invovl ] [ Types ]
NAME
invovl_kpt_type
FUNCTION
Contains information needed to invert the overlap matrix S
SOURCE
87 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 88 89 type, public :: invovl_kpt_type 90 91 integer(kind=c_int32_t) :: nprojs 92 ! total number of projectors 93 ! nlmn for a specific atom = count(indlmn(3,:,itypat)>0) 94 ! A value of -1 means that the following arrays are not allocated 95 96 real(kind=c_double), allocatable :: gram_projs(:,:,:) 97 ! gram_projs(cplx, nprojs, nprojs) 98 ! projs' * projs 99 100 real(kind=c_double), allocatable :: inv_sij(:,:,:,:) 101 ! inv_sij(cplx, lmnmax, lmnmax, ntypat) 102 ! inverse of ham%sij 103 104 real(kind=c_double), allocatable :: inv_s_approx(:,:,:,:) 105 ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat) 106 ! preconditionner 107 108 end type invovl_kpt_type 109 110 !> companion type to invovl_kpt_type to pass data to gpu/cuda 111 type, bind(c), public :: invovl_kpt_gpu_type 112 113 integer(kind=c_int32_t) :: nprojs 114 ! total number of projectors 115 ! nlmn for a specific atom = count(indlmn(3,:,itypat)>0) 116 ! A value of -1 means that the following arrays are not allocated 117 118 type(c_ptr) :: gram_projs 119 ! gram_projs(cplx, nprojs, nprojs) 120 ! projs' * projs 121 122 integer(kind=c_int32_t) :: gram_projs_dim(3) 123 124 type(c_ptr) :: inv_sij 125 ! inv_sij(cplx, lmnmax, lmnmax, ntypat) 126 ! inverse of ham%sij 127 128 integer(kind=c_int32_t) :: inv_sij_dim(4) 129 130 type(c_ptr) :: inv_s_approx 131 ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat) 132 ! preconditionner 133 134 integer(kind=c_int32_t) :: inv_s_approx_dim(4) 135 136 end type invovl_kpt_gpu_type 137 138 #else 139 140 type, public :: invovl_kpt_type 141 142 integer :: nprojs 143 ! total number of projectors 144 ! nlmn for a specific atom = count(indlmn(3,:,itypat)>0) 145 ! A value of -1 means that the following arrays are not allocated 146 147 real(dp), allocatable :: gram_projs(:,:,:) 148 ! gram_projs(cplx, nprojs, nprojs) 149 ! projs' * projs 150 151 real(dp), allocatable :: inv_sij(:,:,:,:) 152 ! inv_sij(cplx, lmnmax, lmnmax, ntypat) 153 ! inverse of ham%sij 154 155 real(dp), allocatable :: inv_s_approx(:,:,:,:) 156 ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat) 157 ! preconditionner 158 159 end type invovl_kpt_type 160 161 #endif
m_invovl/make_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
make_invovl
FUNCTION
Builds of the invovl structure
INPUTS
SOURCE
475 subroutine make_invovl(ham, dimffnl, ffnl, ph3d, mpi_enreg) 476 477 use m_abi_linalg 478 implicit none 479 480 type(gs_hamiltonian_type),intent(in), target :: ham 481 integer, intent(in) :: dimffnl 482 real(dp),intent(in) :: ffnl(ham%npw_k,dimffnl,ham%lmnmax,ham%ntypat) 483 real(dp),intent(in) :: ph3d(2,ham%npw_k,ham%matblk) 484 type(mpi_type) :: mpi_enreg 485 486 real(dp) :: atom_projs(2, ham%npw_k, ham%lmnmax) 487 real(dp) :: temp(ham%npw_k) 488 complex(dpc), allocatable :: work(:) 489 real(dp), allocatable,target :: projs(:,:,:) 490 real(dp), allocatable :: gram_proj(:,:,:) 491 integer, allocatable :: ipiv(:) 492 493 integer :: itypat, ilmn, nlmn, jlmn, ia, iaph3d, shift 494 integer :: il, ilm, jlm, ipw, info, ierr, cplx 495 integer :: ikpt_this_proc,cplex_dij 496 logical :: parity 497 real(dp) :: tsec(2) 498 character(len=500) :: message 499 character :: blas_transpose 500 501 type(invovl_kpt_type), pointer :: invovl 502 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 503 type(invovl_kpt_gpu_type) :: invovl_gpu 504 #endif 505 integer :: array_nprojs_pp(mpi_enreg%nproc_fft) 506 integer :: iproc, slice_size 507 real(dp), allocatable :: gramwork(:,:,:) 508 509 integer, parameter :: timer_mkinvovl = 1620, timer_mkinvovl_build_d = 1621, timer_mkinvovl_build_ptp = 1622 510 511 ! ************************************************************************* 512 513 !! S = 1 + PDP', so 514 !! S^-1 = 1 + P inv_s_projs P', with 515 !! inv_s_projs = - (D^-1 + P'*P)^-1 516 517 if(ham%istwf_k == 1) then 518 cplx = 2 519 blas_transpose = 'c' 520 else 521 cplx = 1 522 blas_transpose = 't' 523 end if 524 525 ikpt_this_proc=bandfft_kpt_get_ikpt() 526 invovl => invovl_kpt(ikpt_this_proc) 527 528 if(invovl%nprojs /= -1) then 529 ! We have been here before, cleanup before remaking 530 call destroy_invovl_ikpt(ikpt_this_proc, ham%gpu_option) 531 end if 532 533 iaph3d = 1 534 535 call timab(timer_mkinvovl,1,tsec) 536 call timab(timer_mkinvovl_build_d,1,tsec) 537 538 ! build nprojs 539 invovl%nprojs = 0 540 do itypat=1,ham%ntypat 541 invovl%nprojs = invovl%nprojs + count(ham%indlmn(3,:,itypat)>0)*ham%nattyp(itypat) 542 end do 543 544 ABI_MALLOC(projs, (2, ham%npw_k, invovl%nprojs)) 545 ABI_MALLOC(invovl%inv_sij, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)) 546 ABI_MALLOC(invovl%inv_s_approx, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)) 547 ! workspace for inversion 548 ABI_MALLOC(ipiv, (ham%lmnmax)) 549 ABI_MALLOC(work, (64*ham%lmnmax)) 550 551 projs = zero 552 invovl%inv_sij = zero 553 invovl%inv_s_approx = zero 554 555 shift = 0 556 do itypat = 1, ham%ntypat 557 nlmn = count(ham%indlmn(3,:,itypat)>0) 558 if (size(ham%sij(:,itypat))==ham%lmnmax*(ham%lmnmax+1)/2) then 559 cplex_dij = 1 560 else if (size(ham%sij(:,itypat))==ham%lmnmax*(ham%lmnmax+1)) then 561 cplex_dij = 2 562 else 563 ABI_ERROR('sij size not recognize') 564 end if 565 !! unpack ham%sij into inv_sij 566 do jlmn = 1, nlmn 567 if (cplex_dij==1) then 568 invovl%inv_sij(1, jlmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + jlmn, itypat) 569 else 570 invovl%inv_sij(1, jlmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1) + 2*jlmn-1, itypat) 571 end if 572 do ilmn = 1, jlmn-1 573 if (cplex_dij==1) then 574 invovl%inv_sij(1, ilmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat) 575 invovl%inv_sij(1, jlmn, ilmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat) 576 else 577 invovl%inv_sij(1, ilmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1) + 2*ilmn-1, itypat) 578 invovl%inv_sij(1, jlmn, ilmn, itypat) = ham%sij(jlmn*(jlmn-1) + 2*ilmn-1, itypat) 579 580 end if 581 end do 582 end do 583 584 ! Invert sij 585 if(cplx == 2) then 586 call ZHETRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 587 call ZHETRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 588 else 589 call DSYTRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 590 call DSYTRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 591 end if 592 ! complete the matrix 593 do ilm=1, nlmn 594 do jlm=1, ilm-1 595 invovl%inv_sij(1,ilm,jlm,itypat) = invovl%inv_sij(1,jlm,ilm,itypat) 596 end do 597 end do 598 599 !! loop on atoms to build atom_projs and fill projs, s_projs 600 do ia = 1, ham%nattyp(itypat) 601 602 !! build atom_projs, from opernlb 603 !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l 604 atom_projs(:,:,:) = 0 605 606 ! start from 4pi/sqrt(ucvol)*ffnl 607 ! atom_projs(1, :, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(:, 1, 1:nlmn) 608 ! TODO vectorize (DCOPY with stride) 609 do ipw=1, ham%npw_k 610 atom_projs(1,ipw, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(ipw, 1, 1:nlmn, itypat) 611 end do 612 613 ! multiply by (-i)^l 614 do ilmn=1,nlmn 615 il=mod(ham%indlmn(1,ilmn, itypat),4); 616 parity=(mod(il,2)==0) 617 if (il>1) then 618 ! multiply by -1 619 atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn) 620 end if 621 if(.not. parity) then 622 ! multiply by -i 623 temp = atom_projs(2,:,ilmn) 624 atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn) 625 atom_projs(1,:,ilmn) = temp 626 end if 627 end do 628 629 ! multiply by conj(ph3d) 630 do ilmn=1,nlmn 631 temp = atom_projs(1, :, ilmn) 632 atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d(1, :, iaph3d) + atom_projs(2, :, ilmn) * ph3d(2, :, iaph3d) 633 atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d(1, :, iaph3d) - temp * ph3d(2, :, iaph3d) 634 end do 635 636 ! me_g0 trick 637 if(ham%istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then 638 atom_projs(1,1,:) = atom_projs(1,1,:) / sqrt2 639 atom_projs(2,1,:) = zero 640 end if 641 if(ham%istwf_k == 2) then 642 atom_projs(:,:,:) = atom_projs(:,:,:)*sqrt2 643 end if 644 645 646 !! atom_projs and typat_s_projs are built, copy them to projs and inv_s_projs 647 projs(:, :, shift+1:shift+nlmn) = atom_projs(:, :, 1:nlmn) 648 shift = shift + nlmn 649 650 ! build inv_s_approx = (D^-1+PtP)^-1 restricted to a single atom block 651 ! can be optimized (real, build directly from ffnl) 652 if(ia == 1) then 653 ! D^-1 654 invovl%inv_s_approx(1, :, :, itypat) = invovl%inv_sij(1, :, :, itypat) 655 ! + PtP 656 ABI_MALLOC(gram_proj, (cplx, nlmn, nlmn)) 657 call abi_xgemm(blas_transpose,'N', nlmn, nlmn, (3-cplx)*ham%npw_k, cone, atom_projs(:,:,1), (3-cplx)*ham%npw_k, & 658 & atom_projs(:,:,1), (3-cplx)*ham%npw_k, czero, gram_proj(:,:,1), nlmn,x_cplx=cplx) 659 call xmpi_sum(gram_proj,mpi_enreg%comm_bandspinorfft,ierr) 660 invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) = invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) + gram_proj(:,:,:) 661 ABI_FREE(gram_proj) 662 ! ^-1 663 if(cplx == 2) then 664 call ZHETRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 665 call ZHETRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 666 else 667 call DSYTRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 668 call DSYTRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 669 end if 670 ! complete lower triangle of matrix 671 do ilm=1, nlmn 672 do jlm=1, ilm-1 673 invovl%inv_s_approx(1, ilm, jlm, itypat) = invovl%inv_s_approx(1, jlm, ilm, itypat) 674 if(cplx == 2) then 675 invovl%inv_s_approx(2, ilm, jlm, itypat) = -invovl%inv_s_approx(2, jlm, ilm, itypat) 676 end if 677 end do 678 end do 679 end if 680 681 iaph3d = iaph3d + 1 682 end do 683 end do 684 ABI_FREE(ipiv) 685 ABI_FREE(work) 686 687 call timab(timer_mkinvovl_build_d, 2, tsec) 688 call timab(timer_mkinvovl_build_ptp, 1, tsec) 689 690 ! Compute P'P one column slice at a time (might be too big to fit in one proc) 691 if(mpi_enreg%paral_kgb == 1) then 692 ! Split the work evenly the fft processors 693 array_nprojs_pp(:) = invovl%nprojs / mpi_enreg%nproc_fft 694 ! not enough work, there's MOD(nprojs,mpi_enreg%nproc_fft) tasks left 695 ! assign them to the first ones 696 array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) = array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) + 1 697 else 698 array_nprojs_pp = invovl%nprojs 699 end if 700 ABI_MALLOC(invovl%gram_projs, (cplx,invovl%nprojs,array_nprojs_pp(mpi_enreg%me_fft+1))) 701 shift = 0 702 if (ham%gpu_option==ABI_GPU_OPENMP .and. mpi_enreg%nproc_fft==1) then 703 #ifdef HAVE_OPENMP_OFFLOAD 704 ! compute gram_projs in one GEMM, only one FFT proc expected in GPU mode 705 slice_size = array_nprojs_pp(1) 706 current_gram_projs => invovl%gram_projs 707 !$OMP TARGET ENTER DATA MAP(alloc:current_gram_projs) 708 !$OMP TARGET ENTER DATA MAP(to:projs) 709 710 !$OMP TARGET DATA USE_DEVICE_PTR(current_gram_projs,projs) 711 call abi_gpu_xgemm(cplx, blas_transpose,'N', invovl%nprojs, slice_size, (3-cplx)*ham%npw_k, cone, & 712 & c_loc(projs), (3-cplx)*ham%npw_k, & 713 & c_loc(projs), (3-cplx)*ham%npw_k, czero, c_loc(current_gram_projs), invovl%nprojs) 714 !$OMP END TARGET DATA 715 !$OMP TARGET EXIT DATA MAP(from:current_gram_projs) 716 !$OMP TARGET EXIT DATA MAP(delete:projs) 717 call xmpi_sum(invovl%gram_projs,mpi_enreg%comm_band,ierr) 718 #endif 719 else 720 do iproc = 1, mpi_enreg%nproc_fft 721 ! compute local contribution to slice iproc of gram_projs 722 slice_size = array_nprojs_pp(iproc) 723 ABI_MALLOC(gramwork, (cplx,invovl%nprojs,slice_size)) 724 call abi_xgemm(blas_transpose,'N', invovl%nprojs, slice_size, (3-cplx)*ham%npw_k, cone, projs(:,:,1), (3-cplx)*ham%npw_k, & 725 & projs(:, :, shift+1), (3-cplx)*ham%npw_k, czero, gramwork(:,:,1), invovl%nprojs,x_cplx=cplx) 726 shift = shift + slice_size 727 ! reduce on proc i 728 call xmpi_sum_master(gramwork, iproc-1, mpi_enreg%comm_fft, ierr) 729 if(iproc == mpi_enreg%me_fft+1) then 730 invovl%gram_projs = gramwork 731 end if 732 ABI_FREE(gramwork) 733 end do 734 call xmpi_sum(invovl%gram_projs,mpi_enreg%comm_band,ierr) 735 end if 736 737 call timab(timer_mkinvovl_build_ptp, 2, tsec) 738 call timab(timer_mkinvovl,2,tsec) 739 740 ABI_FREE(projs) 741 742 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 743 744 ! upload inverse overlap matrices (sij and s_approx) to GPU memory 745 if (ham%gpu_option==ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then 746 ! allocate memory for sij and s_approx on GPU 747 write(message,'(a,a,i12,a,a,i6,a,a,i6,a,a,es12.4,a)') & 748 & 'Allocate GPU memory for inverse overlap computations (sij and s_approx) : ',& 749 & 'nprojs=',invovl%nprojs,ch10,& 750 & 'nlmnax=',ham%lmnmax,ch10,& 751 & 'ntypat=',ham%ntypat,ch10,& 752 & 'gram_projs_gpu_size (GBytes)=',1e-9*cplx*invovl%nprojs*invovl%nprojs*dp,ch10 753 call wrtout(std_out,message,'COLL') 754 call f_gpu_apply_invovl_matrix_alloc(cplx, invovl%nprojs, ham%lmnmax, ham%ntypat, 0) 755 756 invovl_gpu = make_invovl_kpt_gpu(invovl) 757 call f_upload_inverse_overlap(invovl_gpu, cplx, invovl%nprojs, ham%lmnmax, ham%ntypat) 758 write(message,*) 'Invovl uploaded to GPU memory' 759 call wrtout(std_out,message,'COLL') 760 end if 761 762 #endif 763 764 #ifdef HAVE_OPENMP_OFFLOAD 765 if (ham%gpu_option==ABI_GPU_OPENMP) then 766 call refresh_invovl_ompgpu_kpt(ikpt_this_proc) 767 end if 768 #endif 769 770 write(message,*) 'Invovl built' 771 call wrtout(std_out,message,'COLL') 772 773 end subroutine make_invovl
m_invovl/make_invovl_kpt_gpu [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
make_invovl_kpt
FUNCTION
Create a invovl_pkt_gpu_type from a cpu counter part for cuda interoperability
SOURCE
272 function make_invovl_kpt_gpu(invovl) result(invovl_gpu) 273 implicit none 274 type(invovl_kpt_type), intent(inout),target :: invovl 275 type(invovl_kpt_gpu_type) :: invovl_gpu 276 277 invovl_gpu%nprojs = invovl%nprojs 278 279 invovl_gpu%gram_projs = c_loc(invovl%gram_projs(1,1,1)) 280 invovl_gpu%gram_projs_dim = (/ & 281 & size(invovl%gram_projs,1), & 282 & size(invovl%gram_projs,2), & 283 & size(invovl%gram_projs,3) & 284 & /) 285 286 invovl_gpu%inv_sij = c_loc(invovl%inv_sij(1,1,1,1)) 287 invovl_gpu%inv_sij_dim = (/ & 288 & size(invovl%inv_sij,1), & 289 & size(invovl%inv_sij,2), & 290 & size(invovl%inv_sij,3), & 291 & size(invovl%inv_sij,4) & 292 & /) 293 294 invovl_gpu%inv_s_approx = c_loc(invovl%inv_s_approx(1,1,1,1)) 295 invovl_gpu%inv_s_approx_dim = (/ & 296 & size(invovl%inv_s_approx,1), & 297 & size(invovl%inv_s_approx,2), & 298 & size(invovl%inv_s_approx,3), & 299 & size(invovl%inv_s_approx,4) & 300 & /) 301 302 end function make_invovl_kpt_gpu
m_invovl/solve_inner [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
solve_inner
FUNCTION
Helper function: iteratively solves the inner system
INPUTS
SOURCE
1051 subroutine solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, PtPsm1proj, block_sliced) 1052 1053 use m_abi_linalg 1054 implicit none 1055 1056 integer,intent(in) :: ndat,cplx 1057 type(invovl_kpt_type), intent(in) :: invovl 1058 real(dp), intent(inout) :: proj(cplx, invovl%nprojs,ndat) 1059 real(dp), intent(inout) :: sm1proj(cplx, invovl%nprojs, ndat) 1060 real(dp), intent(inout) :: PtPsm1proj(cplx, invovl%nprojs, ndat) 1061 real(dp), allocatable :: temp_proj(:,:,:) 1062 type(mpi_type), intent(in) :: mpi_enreg 1063 type(gs_hamiltonian_type),intent(in) :: ham 1064 integer, intent(in) :: block_sliced 1065 1066 integer :: array_nlmntot_pp(mpi_enreg%nproc_fft) 1067 integer :: nlmntot_this_proc, ibeg, iend, ierr, i, nprojs 1068 real(dp) :: resid(cplx, invovl%nprojs,ndat), precondresid(cplx, invovl%nprojs,ndat) 1069 real(dp) :: normprojs(ndat), errs(ndat), maxerr, previous_maxerr 1070 character(len=500) :: message 1071 1072 real(dp), parameter :: precision = 1e-16 ! maximum relative error. TODO: use tolwfr ? 1073 real(dp) :: convergence_rate 1074 integer :: additional_steps_to_take 1075 1076 ! ************************************************************************* 1077 1078 nprojs = invovl%nprojs 1079 normprojs = SUM(SUM(proj**2, 1),1) 1080 call xmpi_sum(normprojs, mpi_enreg%comm_fft, ierr) 1081 1082 ! Compute work distribution : split nprojs evenly between the fft processors 1083 if(mpi_enreg%paral_kgb == 1) then 1084 array_nlmntot_pp(:) = nprojs / mpi_enreg%nproc_fft 1085 ! not enough work, there's MOD(nprojs,mpi_enreg%nproc_fft) tasks left 1086 ! assign them to the first ones 1087 array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) = array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) + 1 1088 ibeg = SUM(array_nlmntot_pp(1:mpi_enreg%me_fft)) + 1 1089 iend = ibeg + array_nlmntot_pp(1+mpi_enreg%me_fft) - 1 1090 nlmntot_this_proc = iend - ibeg + 1 1091 else 1092 ibeg = 1 1093 iend = nprojs 1094 nlmntot_this_proc = nprojs 1095 end if 1096 1097 ABI_MALLOC(temp_proj, (cplx, nlmntot_this_proc, ndat)) 1098 1099 ! first guess for sm1proj 1100 call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, proj, sm1proj, block_sliced) 1101 1102 ! Iterative refinement 1103 ! TODO use a more efficient iterative algorithm than iterative refinement, use locking 1104 additional_steps_to_take = -1 1105 do i=1, 30 1106 ! compute resid = proj - (D^-1 + PtP)sm1proj 1107 call apply_block(ham, cplx, invovl%inv_sij, nprojs, ndat, sm1proj, resid, block_sliced) 1108 temp_proj = sm1proj(:,ibeg:iend,:) 1109 1110 ! compute matrix multiplication : PtPsm1proj(:,:,1) = invovl%gram * temp_proj(:,:,1) 1111 call abi_xgemm('N', 'N', nprojs, ndat, nlmntot_this_proc, cone, & 1112 & invovl%gram_projs(:,:,1), nprojs, & 1113 & temp_proj(:,:,1), nlmntot_this_proc, czero, & 1114 & PtPsm1proj(:,:,1), nprojs, & 1115 & x_cplx=cplx) 1116 call xmpi_sum(PtPsm1proj, mpi_enreg%comm_fft, ierr) 1117 resid = proj - resid - Ptpsm1proj 1118 ! exit check 1119 errs = SUM(SUM(resid**2, 1),1) 1120 call xmpi_sum(errs, mpi_enreg%comm_fft, ierr) 1121 1122 maxerr = sqrt(MAXVAL(errs/normprojs)) 1123 if(maxerr < precision .or. additional_steps_to_take == 1) then 1124 exit 1125 ! We might stall and never get to the specified precision because of machine errors. 1126 ! If we got to 1e-10, extrapolate convergence rate and determine the number of additional 1127 ! steps to take to reach precision 1128 else if(maxerr < 1e-10 .and. additional_steps_to_take == -1) then 1129 convergence_rate = -LOG(1e-10) / i 1130 additional_steps_to_take = CEILING(-LOG(precision/1e-10)/convergence_rate) + 1 1131 else if(additional_steps_to_take > 0) then 1132 if(previous_maxerr<maxerr)exit 1133 additional_steps_to_take = additional_steps_to_take - 1 1134 end if 1135 previous_maxerr=maxerr 1136 1137 ! add preconditionned residual 1138 call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, resid, precondresid, block_sliced) 1139 sm1proj = sm1proj + precondresid 1140 end do 1141 1142 if(maxerr >= precision .and. maxerr >= 1e-10) then 1143 write(message, *) 'In invovl, max error was', maxerr, ' after 30 iterations' 1144 ABI_WARNING(message) 1145 else 1146 ! write(message,'(a,i2,a,es13.5)') 'Iterative solver in invovl finished in ', i, ' iterations, error', maxerr 1147 ! call wrtout(std_out,message,'COLL') 1148 end if 1149 1150 ABI_FREE(temp_proj) 1151 1152 end subroutine solve_inner
m_invovl/solve_inner_ompgpu [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
solve_inner_ompgpu
FUNCTION
Helper function: iteratively solves the inner system (OpenMP GPU offload implementation)
INPUTS
SOURCE
1455 subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, PtPsm1proj, block_sliced) 1456 1457 use m_abi_linalg 1458 implicit none 1459 1460 integer,intent(in) :: ndat,cplx 1461 type(invovl_kpt_type), intent(in), target :: invovl 1462 real(dp), intent(inout) :: proj(cplx, invovl%nprojs,ndat) 1463 real(dp), intent(inout), target :: sm1proj(cplx, invovl%nprojs, ndat) 1464 real(dp), intent(inout), target :: PtPsm1proj(cplx, invovl%nprojs, ndat) 1465 type(mpi_type), intent(in) :: mpi_enreg 1466 type(gs_hamiltonian_type),intent(in) :: ham 1467 integer, intent(in) :: block_sliced 1468 1469 integer :: array_nlmntot_pp(mpi_enreg%nproc_fft) 1470 integer :: nlmntot_this_proc, ibeg, iend, ierr, i, nprojs 1471 real(dp) :: resid(cplx, invovl%nprojs,ndat), precondresid(cplx, invovl%nprojs,ndat) 1472 real(dp) :: normprojs(ndat), errs(ndat), maxerr, previous_maxerr 1473 character(len=500) :: message 1474 1475 real(dp), parameter :: precision = 1e-16 ! maximum relative error. TODO: use tolwfr ? 1476 real(dp) :: convergence_rate,sum_tmp 1477 integer :: additional_steps_to_take,idat,iproj,icplx 1478 integer :: Ptsize(3) 1479 #ifdef HAVE_GPU_HIP 1480 type(c_ptr) :: sm1proj_amdcopy,PtPsm1proj_amdcopy 1481 #endif 1482 1483 ! ************************************************************************* 1484 1485 Ptsize(1) = cplx 1486 Ptsize(2) = invovl%nprojs 1487 Ptsize(3) = ndat 1488 nprojs = invovl%nprojs 1489 #if defined HAVE_GPU_HIP && defined FC_LLVM 1490 !FIXME Work-around for AOMP v15.0.3 (AMD Flang fork) 1491 sm1proj_amdref => sm1proj 1492 PtPsm1proj_amdref => PtPsm1proj 1493 #endif 1494 1495 !$OMP TARGET ENTER DATA MAP(alloc:errs,precondresid,resid,normprojs) 1496 1497 !FIXME LLVM has trouble with performing team reduction (AOMP 15.0.2) 1498 #ifdef FC_LLVM 1499 !$OMP TARGET UPDATE FROM(proj) 1500 #else 1501 !$OMP TARGET TEAMS DISTRIBUTE MAP(to:normprojs,proj) PRIVATE(idat,sum_tmp) 1502 #endif 1503 do idat = 1,ndat 1504 sum_tmp=0 1505 #ifndef FC_LLVM 1506 !$OMP PARALLEL DO COLLAPSE(2) REDUCTION(+:sum_tmp) PRIVATE(iproj,icplx) 1507 #endif 1508 do iproj = 1,nprojs 1509 do icplx = 1,cplx 1510 sum_tmp = sum_tmp + proj(icplx,iproj,idat)**2 1511 end do 1512 end do 1513 normprojs(idat)=sum_tmp 1514 end do 1515 #ifndef FC_LLVM 1516 !$OMP TARGET UPDATE FROM(normprojs) 1517 #endif 1518 1519 ibeg = 1 1520 iend = nprojs 1521 nlmntot_this_proc = nprojs 1522 1523 ! first guess for sm1proj 1524 call apply_block_ompgpu(ham, cplx, invovl%inv_s_approx, nprojs, ndat, proj, sm1proj, block_sliced) 1525 1526 ! Iterative refinement 1527 ! TODO use a more efficient iterative algorithm than iterative refinement, use locking 1528 additional_steps_to_take = -1 1529 do i=1, 30 1530 ! compute resid = proj - (D^-1 + PtP)sm1proj 1531 call apply_block_ompgpu(ham, cplx, invovl%inv_sij, nprojs, ndat, sm1proj, resid, block_sliced) 1532 1533 ! compute matrix multiplication : PtPsm1proj(:,:,1) = invovl%gram * sm1proj(:,:,1) 1534 ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER_GEMM) 1535 #if defined HAVE_GPU_HIP && defined FC_LLVM 1536 !$OMP TARGET DATA USE_DEVICE_PTR(current_gram_projs, sm1proj_amdref, PtPsm1proj_amdref) 1537 call abi_gpu_xgemm(cplx, 'N', 'N', nprojs, ndat, nlmntot_this_proc, cone, & 1538 c_loc(current_gram_projs), nprojs,& 1539 c_loc(sm1proj_amdref), nlmntot_this_proc, czero, & 1540 c_loc(PtPsm1proj_amdref), nprojs) 1541 !$OMP END TARGET DATA 1542 #else 1543 !$OMP TARGET DATA USE_DEVICE_PTR(current_gram_projs, sm1proj, PtPsm1proj) 1544 call abi_gpu_xgemm(cplx, 'N', 'N', nprojs, ndat, nlmntot_this_proc, cone, & 1545 c_loc(current_gram_projs), nprojs,& 1546 c_loc(sm1proj), nlmntot_this_proc, czero, & 1547 c_loc(PtPsm1proj), nprojs) 1548 !$OMP END TARGET DATA 1549 #endif 1550 1551 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & 1552 !$OMP& PRIVATE(idat,iproj,icplx) MAP(to:proj,resid,PtPsm1proj) 1553 do idat =1, ndat 1554 do iproj =1, nprojs 1555 do icplx = 1,cplx 1556 resid(icplx, iproj, idat) = proj(icplx, iproj, idat) - resid(icplx, iproj, idat) - PtPsm1proj(icplx, iproj, idat) 1557 end do 1558 end do 1559 end do 1560 1561 ! exit check 1562 #ifdef FC_LLVM 1563 !FIXME LLVM has trouble with performing team reduction (v16.0.0 from AMD ROCm 5.6.0) 1564 !$OMP TARGET UPDATE FROM(resid) 1565 errs = SUM(SUM(resid**2, 1),1) 1566 #else 1567 !$OMP TARGET TEAMS DISTRIBUTE MAP(to:errs,resid) PRIVATE(idat,sum_tmp) 1568 do idat = 1,ndat 1569 sum_tmp=0 1570 !$OMP PARALLEL DO COLLAPSE(2) REDUCTION(+:sum_tmp) PRIVATE(iproj,icplx) 1571 do iproj = 1,nprojs 1572 do icplx = 1,cplx 1573 sum_tmp = sum_tmp + resid(icplx,iproj,idat)**2 1574 end do 1575 end do 1576 errs(idat)=sum_tmp 1577 end do 1578 !$OMP TARGET UPDATE FROM(errs) 1579 #endif 1580 1581 ABI_NVTX_END_RANGE() 1582 1583 maxerr = sqrt(MAXVAL(errs/normprojs)) 1584 if(maxerr < precision .or. additional_steps_to_take == 1) then 1585 exit 1586 ! We might stall and never get to the specified precision because of machine errors. 1587 ! If we got to 1e-10, extrapolate convergence rate and determine the number of additional 1588 ! steps to take to reach precision 1589 else if(maxerr < 1e-10 .and. additional_steps_to_take == -1) then 1590 convergence_rate = -LOG(1e-10) / i 1591 additional_steps_to_take = CEILING(-LOG(precision/1e-10)/convergence_rate) + 1 1592 else if(additional_steps_to_take > 0) then 1593 if(previous_maxerr<maxerr)exit 1594 additional_steps_to_take = additional_steps_to_take - 1 1595 end if 1596 previous_maxerr=maxerr 1597 1598 ! add preconditionned residual 1599 call apply_block_ompgpu(ham, cplx, invovl%inv_s_approx, nprojs, ndat, resid, precondresid, block_sliced) 1600 1601 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & 1602 !$OMP& PRIVATE(idat,iproj,icplx) MAP(to:sm1proj,precondresid) 1603 do idat =1, ndat 1604 do iproj =1, nprojs 1605 do icplx = 1,cplx 1606 sm1proj(icplx, iproj, idat) = sm1proj(icplx, iproj, idat) + precondresid(icplx, iproj, idat) 1607 end do 1608 end do 1609 end do 1610 end do 1611 !$OMP TARGET EXIT DATA MAP(delete:errs,resid,precondresid,normprojs) 1612 1613 if(maxerr >= precision .and. maxerr >= 1e-10) then 1614 write(message, *) 'In invovl, max error was', maxerr, ' after 30 iterations' 1615 ABI_WARNING(message) 1616 else 1617 ! write(message,'(a,i2,a,es13.5)') 'Iterative solver in invovl finished in ', i, ' iterations, error', maxerr 1618 ! call wrtout(std_out,message,'COLL') 1619 end if 1620 1621 end subroutine solve_inner_ompgpu