ABINIT/m_invovl [ Modules ]


NAME

  m_invovl

FUNCTION

  Provides functions to invert the overlap matrix S. Used by the Chebyshev solver in the PAW case.
  See the paper by A. Levitt and M. Torrent for details.
  S = 1 + projs * s_projs * projs'
  S^-1 = 1 + projs * inv_s_projs * projs', with
  inv_s_projs = - (s_projs^-1 + projs'*projs)^-1
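
  A quick check of this Woodbury-type identity, assuming the per-type blocks of s_projs are
  invertible (notation: P = projs, D = s_projs, E = inv_s_projs, P' = conjugate transpose):

    (1 + P D P') (1 + P E P') = 1 + P [ D + (1 + D P'P) E ] P'
                              = 1 + P D [ 1 + (D^-1 + P'P) E ] P'
                              = 1,    since (D^-1 + P'P) E = -1 .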

COPYRIGHT

 Copyright (C) 2013-2024 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

20 #if defined HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23 
24 #include "abi_common.h"
25 
26 ! nvtx related macro definition
27 #include "nvtx_macros.h"
28 
29 MODULE m_invovl
30 
31  use defs_basis
32  use m_errors
33  use m_xmpi
34  use m_xomp
35  use m_abicore
36  use m_abi_linalg
37 
38  use defs_abitypes, only : mpi_type
39  use m_time,        only : timab
40  use m_hamiltonian, only : gs_hamiltonian_type
41  use m_bandfft_kpt, only : bandfft_kpt_get_ikpt
42  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby
43  use m_gemm_nonlop, only : gemm_nonlop_use_gemm
44  use m_nonlop,      only : nonlop
45  use m_prep_kgb,    only : prep_nonlop
46 
47 #ifdef HAVE_FC_ISO_C_BINDING
48  use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_int64_t, c_float, c_double, c_size_t, c_loc
49 #endif
50 
51 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
52  use m_nvtx_data
53 #endif
54 
55 #ifdef HAVE_GPU
56   use m_gpu_toolbox
57 #endif
58 
59 #ifdef HAVE_KOKKOS
60  use m_manage_kokkos, only : add_array_kokkos
61 #endif
62 
63  implicit none
64 
65  private
66 
67 !public procedures.
68  public :: init_invovl
69  public :: make_invovl
70  public :: apply_invovl
71  public :: destroy_invovl
72 
73  ! Those routines are here to assess memory requirements
74  public :: invovl_ompgpu_work_mem
75  public :: invovl_ompgpu_static_mem

m_invovl/apply_block [ Functions ]


NAME

 apply_block

FUNCTION

 Helper function: applies a block-diagonal matrix mat(lmnmax, lmnmax, ntypat)

INPUTS

SOURCE

1165 subroutine apply_block(ham, cplx, mat, nprojs, ndat, x, y, block_sliced)
1166 
1167   use m_abi_linalg
1168   implicit none
1169 
1170   integer,intent(in) :: ndat, nprojs, cplx
1171   real(dp), intent(inout) :: x(cplx, nprojs, ndat), y(cplx, nprojs, ndat)
1172   type(gs_hamiltonian_type),intent(in) :: ham
1173   real(dp), intent(in) :: mat(cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)
1174   integer, intent(in) :: block_sliced
1175 
1176   integer :: nlmn, shift, itypat, idat
1177 
1178 ! *************************************************************************
1179 
1180   if (block_sliced == 1) then
1181 
1182      do idat = 1, ndat
1183         shift = 1
1184         do itypat=1, ham%ntypat
1185            nlmn = count(ham%indlmn(3,:,itypat)>0)
1186            !! apply mat to all atoms at once
1187            ! perform natom multiplications of size nlmn
1188            ! compute y = mat*x
1189            if(cplx == 2) then
1190               call ZHEMM('L','U', nlmn, ham%nattyp(itypat), cone, &
1191                    &  mat(:, :, :, itypat), ham%lmnmax, &
1192                    &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, czero, &
1193                    &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn)
1194            else
1195               call DSYMM('L','U', nlmn, ham%nattyp(itypat), one, &
1196                    &  mat(:, :, :, itypat), ham%lmnmax, &
1197                    &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, zero, &
1198                    &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn)
1199            end if
1200            shift = shift + nlmn*ham%nattyp(itypat)
1201         end do
1202      end do
1203 
1204   else ! block_sliced = 0
1205 
1206      shift = 1
1207      do itypat=1, ham%ntypat
1208         nlmn = count(ham%indlmn(3,:,itypat)>0)
1209         !! apply mat to all atoms at once, all idat at once
1210         ! perform natom multiplications of size nlmn
1211        ! be careful: the matrices extracted from x and y are not memory contiguous
1212        ! ==> so in the GPU version we need to adapt the leading dimension
1213         if(cplx == 2) then
1214            call ZHEMM('L','U', nlmn, ham%nattyp(itypat)*ndat, cone, &
1215                 &  mat(:, :, :, itypat), ham%lmnmax, &
1216                 &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn, czero, &
1217                 &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn)
1218         else
1219            call DSYMM('L','U', nlmn, ham%nattyp(itypat)*ndat, one, &
1220                 &  mat(:, :, :, itypat), ham%lmnmax, &
1221                 &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn, zero, &
1222                 &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn)
1223         end if
1224         shift = shift + nlmn*ham%nattyp(itypat)
1225      end do
1226 
1227   end if
1228 
1229 end subroutine apply_block
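
  The data layout assumed above: for each atom type the packed projection vector holds nlmn
  contiguous coefficients per atom, so one symmetric multiply per type covers all atoms of
  that type at once. A minimal standalone sketch of the real (cplx = 1) case, with toy sizes
  and a random block; it only assumes a linked BLAS library providing DSYMM:

    program demo_apply_block
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: nlmn = 3, nattyp = 2, ndat = 1   ! one atom type, two atoms
      real(dp) :: mat(nlmn, nlmn)           ! the per-type block, shared by all its atoms
      real(dp) :: x(nlmn*nattyp, ndat), y(nlmn*nattyp, ndat)
      integer  :: idat, shift
      call random_number(mat); mat = 0.5_dp*(mat + transpose(mat))  ! make it symmetric
      call random_number(x)
      do idat = 1, ndat
        shift = 1
        ! one DSYMM applies the nlmn x nlmn block to the nattyp atoms of this type
        call DSYMM('L', 'U', nlmn, nattyp, 1.0_dp, mat, nlmn, &
             &     x(shift:shift+nlmn*nattyp-1, idat), nlmn, 0.0_dp, &
             &     y(shift:shift+nlmn*nattyp-1, idat), nlmn)
        shift = shift + nlmn*nattyp
      end do
      print '(a,6f10.5)', ' y = ', y(:, 1)
    end program demo_apply_block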

m_invovl/apply_block_ompgpu [ Functions ]


NAME

 apply_block_ompgpu

FUNCTION

 Helper function: applies a block-diagonal matrix mat(lmnmax, lmnmax, ntypat)

INPUTS

SOURCE

1634 subroutine apply_block_ompgpu(ham, cplx, mat, nprojs, ndat, x, y, block_sliced)
1635 
1636   use m_abi_linalg
1637   implicit none
1638 
1639   integer,intent(in) :: ndat, nprojs, cplx
1640   real(dp), intent(inout), target :: x(cplx, nprojs, ndat), y(cplx, nprojs, ndat)
1641   type(gs_hamiltonian_type),intent(in) :: ham
1642   real(dp), intent(in), target :: mat(cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)
1643   integer, intent(in) :: block_sliced
1644 
1645   integer :: nlmn, shift, itypat, idat
1646   real(dp), ABI_CONTIGUOUS pointer :: x_ptr(:, :, :), y_ptr(:, :, :), mat_ptr(:,:,:)
1647 
1648 ! *************************************************************************
1649 
1650   if (block_sliced == 1) then
1651 
1652      do idat = 1, ndat
1653         shift = 1
1654         do itypat=1, ham%ntypat
1655            nlmn = count(ham%indlmn(3,:,itypat)>0)
1656            !! apply mat to all atoms at once
1657            ! perform natom multiplications of size nlmn
1658            ! compute y = mat*x
1659            if(cplx == 2) then
1660              !$OMP TARGET DATA USE_DEVICE_PTR(mat,x,y)
1661              call abi_gpu_zhemm('L','U', nlmn, ham%nattyp(itypat), cone, &
1662                    c_loc(mat(:, :, :, itypat)), ham%lmnmax, &
1663                    c_loc(x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn, czero, &
1664                    c_loc(y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn)
1665              !$OMP END TARGET DATA
1666            else
1667              !$OMP TARGET DATA USE_DEVICE_PTR(mat,x,y)
1668              call abi_gpu_xsymm(cplx, 'L','U', nlmn, ham%nattyp(itypat), cone, &
1669                    c_loc(mat(:, :, :, itypat)), ham%lmnmax, &
1670                    c_loc(x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn, czero, &
1671                    c_loc(y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat)), nlmn)
1672              !$OMP END TARGET DATA
1673            end if
1674            shift = shift + nlmn*ham%nattyp(itypat)
1675         end do
1676      end do
1677 
1678   else ! block_sliced = 0
1679 
1680     shift = 1
1681     do itypat=1, ham%ntypat
1682       nlmn = count(ham%indlmn(3,:,itypat)>0)
1683       x_ptr => x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, :)
1684       y_ptr => y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, :)
1685       mat_ptr => mat(:, :, :, itypat)
1686       !! apply mat to all atoms at once, all idat at once
1687       ! perform natom multiplications of size nlmn
1688       ! be careful: the matrices extracted from x and y are not memory contiguous
1689       ! ==> so in the GPU version we need to adapt the leading dimension
1690       !$OMP TARGET DATA USE_DEVICE_PTR(mat_ptr,x_ptr,y_ptr)
1691       call abi_gpu_xgemm_strided(cplx, 'N','N', &
1692               nlmn, ham%nattyp(itypat), nlmn, cone, &
1693               c_loc(mat_ptr), ham%lmnmax, 0, &
1694               c_loc(x_ptr), nlmn, nprojs, &
1695               czero, &
1696               c_loc(y_ptr), nlmn, nprojs, ndat)
1697       !$OMP END TARGET DATA
1698       shift = shift + nlmn*ham%nattyp(itypat)
1699     end do
1700 
1701   end if
1702 
1703 end subroutine apply_block_ompgpu
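
  The strided-batched GEMM above replaces ndat independent multiplications: the per-type matrix
  is reused for every batch entry (batch stride 0 on mat_ptr), while x and y advance by a full
  projection vector (batch stride nprojs) between entries, which is why a plain GEMM on a
  contiguous block is not applicable here. A minimal sketch of the equivalent computation in
  plain Fortran (toy sizes; matmul stands in for the GPU call):

    program demo_strided_batch
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: nlmn = 3, nattyp = 2, ndat = 4
      real(dp) :: a(nlmn, nlmn)                   ! batch stride 0: same block for every idat
      real(dp) :: x(nlmn, nattyp, ndat), y(nlmn, nattyp, ndat)
      integer  :: idat
      call random_number(a); call random_number(x)
      do idat = 1, ndat                           ! the batched call fuses this whole loop
        y(:, :, idat) = matmul(a, x(:, :, idat))
      end do
      print *, 'y(1,1,:) =', y(1, 1, :)
    end program demo_strided_batch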

m_invovl/apply_invovl [ Functions ]


NAME

 apply_invovl

FUNCTION

 Applies the inverse of the overlap matrix to cwavef

INPUTS

SOURCE

 787 subroutine apply_invovl(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced)
 788 
 789 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
 790   use, intrinsic :: iso_c_binding
 791 #endif
 792 
 793   implicit none
 794 
 795   ! args
 796   type(gs_hamiltonian_type), intent(in), target :: ham
 797   integer, intent(in) :: npw, ndat
 798   integer, intent(in) :: nspinor
 799   integer, intent(in) :: block_sliced
 800   real(dp), intent(inout), target :: cwavef(2, npw*nspinor*ndat) ! TODO should be in, fix nonlop
 801   type(mpi_type) :: mpi_enreg
 802   real(dp), intent(inout), target :: sm1cwavef(2, npw*nspinor*ndat)
 803   type(pawcprj_type), intent(inout) :: cwaveprj(:,:)
 804 
 805   real(dp),allocatable, target :: proj(:,:,:), sm1proj(:,:,:), PtPsm1proj(:,:,:)
 806 
 807   ! used to pass proj dimensions to cuda
 808   integer(kind=c_int32_t) :: proj_dim(3)
 809   integer(kind=c_int32_t) :: nattyp_dim
 810   integer(kind=c_int32_t) :: indlmn_dim(3)
 811 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
 812   integer(kind=c_int32_t) :: cwavef_size
 813 #endif
 814 
 815   integer :: idat, iatom, nlmn, shift
 816   real(dp) :: tsec(2)
 817 
 818   integer :: choice, cpopt, paw_opt , cplx
 819   character :: blas_transpose
 820   type(pawcprj_type),allocatable :: cwaveprj_in(:,:)
 821 
 822   integer :: ikpt_this_proc
 823   ! dummies
 824   real(dp) :: enlout(ndat), lambda_block(1), gvnlxc(1,1)
 825   integer, parameter :: nnlout = 0, idir = 0, signs = 2
 826 
 827   type(invovl_kpt_type), pointer :: invovl
 828 
 829   ! *************************************************************************
 830 
 831   if(ham%gpu_option==ABI_GPU_OPENMP) then
 832 #ifdef HAVE_OPENMP_OFFLOAD
 833     call apply_invovl_ompgpu(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced)
 834     return
 835 #endif
 836   end if
 837 
 838   ABI_NVTX_START_RANGE(NVTX_INVOVL_PREP)
 839   ikpt_this_proc=bandfft_kpt_get_ikpt()
 840   invovl => invovl_kpt(ikpt_this_proc)
 841 
 842   if(ham%istwf_k == 1) then
 843     cplx = 2
 844     blas_transpose = 'c'
 845   else
 846     cplx = 1
 847     blas_transpose = 't'
 848   end if
 849 
 850   ABI_MALLOC(proj,       (cplx,invovl%nprojs,nspinor*ndat))
 851   ABI_MALLOC(sm1proj,    (cplx,invovl%nprojs,nspinor*ndat))
 852   ABI_MALLOC(PtPsm1proj, (cplx,invovl%nprojs,nspinor*ndat))
 853   proj = zero
 854   sm1proj = zero
 855   PtPsm1proj = zero
 856 
 857   proj_dim = (/ size(proj,1), size(proj,2), size(proj,3) /)
 858 
 859   nattyp_dim = size(ham%nattyp)
 860 
 861   indlmn_dim = (/ size(ham%indlmn,1), size(ham%indlmn,2), size(ham%indlmn,3) /)
 862 
 863 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
 864 
 865   !! memory allocation of data used in solve_inner_gpu
 866   !! note : this is actually done only once
 867   if (ham%gpu_option==ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then
 868 
 869 #ifdef DEBUG_VERBOSE_GPU
 870     if(xmpi_comm_rank(xmpi_world) == 0) then
 871       call check_gpu_mem("gpu_apply_invovl_inner_alloc begin")
 872     end if
 873 #endif
 874 
 875     ! make sure to use sizes from apply_invovl
 876     call f_gpu_apply_invovl_inner_alloc(proj_dim, ham%ntypat, 0)
 877 
 878 #ifdef DEBUG_VERBOSE_GPU
 879     if(xmpi_comm_rank(xmpi_world) == 0) then
 880       call check_gpu_mem("gpu_apply_invovl_inner_alloc end")
 881     end if
 882 #endif
 883 
 884     ! TODO find a better place to put that initialization
 885     call f_gpu_init_invovl_data(indlmn_dim, c_loc(ham%indlmn(1,1,1)))
 886 
 887   end if
 888 
 889 #endif
 890 
 891 
 892   call timab(timer_apply_inv_ovl_opernla, 1, tsec)
 893 
 894   ! cwaveprj may be dummy or unused if gemm nonlop is turned on
 895   if((.not. gemm_nonlop_use_gemm) .or. size(cwaveprj) > 1) then
 896     ABI_MALLOC(cwaveprj_in, (ham%natom,nspinor*ndat))
 897     call pawcprj_alloc(cwaveprj_in,0,ham%dimcprj)
 898   else
 899     ABI_MALLOC(cwaveprj_in, (1,1))
 900     call pawcprj_alloc(cwaveprj_in,0,(/1/))
 901   end if
 902   ABI_NVTX_END_RANGE()
 903 
 904   ! get the cprj
 905   ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP1)
 906   choice = 0 ! only compute cprj, nothing else
 907   cpopt = 0 ! compute and save cprj
 908   paw_opt = 3 ! S nonlocal operator
 909   if (mpi_enreg%paral_kgb==1) then
 910     call prep_nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,ndat,mpi_enreg,&
 911       &                   nnlout,paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,&
 912       &                   already_transposed=.true.,&
 913       &                   gpu_option=ham%gpu_option,&
 914       &                   vectproj=proj)
 915   else
 916     call nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,mpi_enreg,ndat,&
 917       &              nnlout,paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=proj)
 918   end if
 919   ABI_NVTX_END_RANGE()
 920 
 921   call timab(timer_apply_inv_ovl_opernla, 2, tsec)
 922   call timab(timer_apply_inv_ovl_inv_s, 1, tsec)
 923 
 924   ! If using GEMM nonlop, proj array is used directly instead of writing in cwaveprj_in content, so skip this copy
 925   if(.not. gemm_nonlop_use_gemm) then
 926     ! copy cwaveprj_in to proj(:,:)
 927     do idat=1, ndat*nspinor
 928       shift = 0
 929       do iatom = 1, ham%natom
 930         nlmn = cwaveprj_in(iatom, idat)%nlmn
 931         proj(1:cplx, shift+1:shift+nlmn, idat) = cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn)
 932         shift = shift + nlmn
 933       end do
 934     end do
 935   end if
 936 
 937   !multiply by S^-1
 938   ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER)
 939   ! TODO : when solve_inner_gpu is ready, update the following to activate GPU computation
 940   if (ham%gpu_option == ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then
 941 
 942 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
 943 
 944     if (mpi_enreg%nproc_fft /= 1) then
 945       ABI_ERROR("[66_wfs/m_invovl.F90:apply_invovl] nproc_fft must be 1 when GPU/CUDA is activated")
 946     end if
 947 
 948     call f_solve_inner_gpu(proj_dim, c_loc(proj(1,1,1)), &
 949       & c_loc(sm1proj(1,1,1)), c_loc(PtPsm1proj(1,1,1)), &
 950       & nattyp_dim, c_loc(ham%nattyp(1)), ham%ntypat, &
 951       & ham%lmnmax, cplx, block_sliced)
 952 
 953 #endif
 954 
 955   else
 956 
 957     call solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat*nspinor, sm1proj, PtPsm1proj, block_sliced)
 958     sm1proj = - sm1proj
 959     PtPsm1proj = - PtPsm1proj
 960   end if
 961 
 962   ABI_NVTX_END_RANGE()
 963 
 964   ! If using GEMM nonlop, sm1proj array is used directly instead of reading cwaveprj content, so skip this copy
 965   if(.not. gemm_nonlop_use_gemm) then
 966     ! copy sm1proj to cwaveprj(:,:)
 967     do idat=1, ndat*nspinor
 968       shift = 0
 969       do iatom = 1, ham%natom
 970         nlmn = cwaveprj(iatom, idat)%nlmn
 971         cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = sm1proj(1:cplx, shift+1:shift+nlmn, idat)
 972         shift = shift + nlmn
 973       end do
 974     end do
 975   end if
 976   call timab(timer_apply_inv_ovl_inv_s, 2, tsec)
 977   call timab(timer_apply_inv_ovl_opernlb, 1, tsec)
 978 
 979   ! get the corresponding wf
 980   ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP2)
 981   cpopt = 2 ! reuse cprj
 982   choice = 7 ! get wf from cprj, without the application of S
 983   paw_opt = 3
 984   if (mpi_enreg%paral_kgb==1) then
 985     call prep_nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,ndat,mpi_enreg,nnlout,&
 986       &                   paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,already_transposed=.true.,&
 987       &                   gpu_option=ham%gpu_option,vectproj=sm1proj)
 988   else
 989     call nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,paw_opt,&
 990       &              signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=sm1proj)
 991   end if
 992   ABI_NVTX_END_RANGE()
 993 
 994   call timab(timer_apply_inv_ovl_opernlb, 2, tsec)
 995 
 996   ABI_NVTX_START_RANGE(NVTX_INVOVL_POST2)
 997   if(size(cwaveprj) > 1) then
 998     ! copy PtPSm1proj to cwaveprj(:,:)
 999     do idat=1, ndat*nspinor
1000       shift = 0
1001       do iatom = 1, ham%natom
1002         nlmn = cwaveprj(iatom, idat)%nlmn
1003         cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = PtPsm1proj(1:cplx, shift+1:shift+nlmn, idat)
1004         shift = shift + nlmn
1005       end do
1006     end do
1007     !cwaveprj_in is empty if GEMM nonlop is being used, so populate it here
1008     if(gemm_nonlop_use_gemm) then
1009       do idat=1, ndat*nspinor
1010         shift = 0
1011         do iatom = 1, ham%natom
1012           nlmn = cwaveprj_in(iatom, idat)%nlmn
1013           cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn) = proj(1:cplx, shift+1:shift+nlmn, idat)
1014           shift = shift + nlmn
1015         end do
1016       end do
1017     end if
1018     call pawcprj_axpby(one, one, cwaveprj_in, cwaveprj)
1019   end if
1020   call pawcprj_free(cwaveprj_in)
1021   ABI_FREE(cwaveprj_in)
1022 
1023   if (ham%gpu_option == ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then
1024 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) && defined(HAVE_YAKL)
1025     cwavef_size = 2*npw*nspinor*ndat
1026     call add_array_kokkos(c_loc(sm1cwavef), c_loc(cwavef), cwavef_size)
1027 #endif
1028   else
1029     sm1cwavef = cwavef + sm1cwavef
1030   end if
1031 
1032   ABI_NVTX_END_RANGE()
1033 
1034   ABI_FREE(proj)
1035   ABI_FREE(sm1proj)
1036   ABI_FREE(PtPsm1proj)
1037 
1038 end subroutine apply_invovl
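
  In equations, the sequence performed above (c = cwavef, P = projs, D = s_projs, and
  E = inv_s_projs = -(D^-1 + P'P)^-1, with P' the conjugate transpose):

    proj       = P' c                                (first nonlop/prep_nonlop call, choice=0, cpopt=0)
    sm1proj    = E proj,  PtPsm1proj = P'P E proj    (solve_inner, followed by the sign flip)
    sm1cwavef  = c + P sm1proj = (1 + P E P') c = S^-1 c   (second nonlop call, choice=7, plus the final add)
    cwaveprj   = proj + PtPsm1proj = P' S^-1 c       (the cprj of the result, ready for a later application of S)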

m_invovl/apply_invovl_ompgpu [ Functions ]


NAME

 apply_invovl_ompgpu

FUNCTION

 Applies the inverse of the overlap matrix to cwavef (OpenMP GPU implementation)

INPUTS

SOURCE

1293 subroutine apply_invovl_ompgpu(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced)
1294 
1295 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU)
1296   use, intrinsic :: iso_c_binding
1297 #endif
1298 
1299   implicit none
1300 
1301   ! args
1302   type(gs_hamiltonian_type), intent(in), target :: ham
1303   integer, intent(in) :: npw, ndat
1304   integer, intent(in) :: nspinor
1305   integer, intent(in) :: block_sliced
1306   real(dp), intent(inout), target :: cwavef(2, npw*nspinor*ndat) ! TODO should be in, fix nonlop
1307   type(mpi_type) :: mpi_enreg
1308   real(dp), intent(inout), target :: sm1cwavef(2, npw*nspinor*ndat)
1309   type(pawcprj_type), intent(inout) :: cwaveprj(:,:)
1310   logical :: transfer_omp_args
1311 
1312   real(dp), ABI_CONTIGUOUS pointer :: proj(:,:,:),sm1proj(:,:,:),PtPsm1proj(:,:,:)
1313 
1314   integer :: idat, iatom, icplx, iproj, nprojs, nlmn, shift
1315   real(dp) :: tsec(2)
1316 
1317   integer :: choice, cpopt, paw_opt , cplx, old_me_g0
1318   type(pawcprj_type),allocatable :: cwaveprj_in(:,:)
1319 
1320   integer :: ikpt_this_proc
1321   ! dummies
1322   real(dp) :: enlout(ndat), lambda_block(1), gvnlxc(1,1)
1323   integer, parameter :: nnlout = 0, idir = 0, signs = 2
1324 
1325   type(invovl_kpt_type), pointer :: invovl
1326 
1327   ! *************************************************************************
1328 
1329   ikpt_this_proc=bandfft_kpt_get_ikpt()
1330   invovl => invovl_kpt(ikpt_this_proc)
1331   nprojs=invovl%nprojs
1332   if(ikpt_this_proc /= current_ikpt_in_gpu) call refresh_invovl_ompgpu_kpt(ikpt_this_proc)
1333 
1334   if(ham%istwf_k == 1) then
1335     cplx = 2
1336   else
1337     cplx = 1
1338   end if
1339   if(gpu_initialized == 0) call alloc_ompgpu_buffers(cplx,nprojs,nspinor,ndat)
1340   proj => proj_ompgpu
1341   sm1proj => sm1proj_ompgpu
1342   PtPsm1proj => PtPsm1proj_ompgpu
1343   !$OMP TARGET ENTER DATA MAP(alloc:proj,sm1proj,PtPsm1proj)
1344 
1345   transfer_omp_args =  .not. ( xomp_target_is_present(c_loc(sm1cwavef)) &
1346    .and. xomp_target_is_present(c_loc(cwavef)))
1347   !$OMP TARGET ENTER DATA MAP(alloc:gvnlxc)
1348   if(transfer_omp_args) then
1349    !$OMP TARGET ENTER DATA MAP(alloc:sm1cwavef,cwavef)
1350    !$OMP TARGET UPDATE TO(sm1cwavef,cwavef)
1351   end if
1352 
1353   call timab(timer_apply_inv_ovl_opernla, 1, tsec)
1354 
1355   ! get the cprj
1356   ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP1)
1357   choice = 0 ! only compute cprj, nothing else
1358   cpopt = 0 ! compute and save cprj
1359   paw_opt = 3 ! S nonlocal operator
1360 
1361   if(ham%istwf_k==2) then
1362     old_me_g0=mpi_enreg%me_g0
1363     if (mpi_enreg%me_fft==0) then
1364       mpi_enreg%me_g0=1
1365     else
1366       mpi_enreg%me_g0=0
1367     end if
1368   end if
1369   call nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,&
1370                 paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=proj)
1371   ABI_NVTX_END_RANGE()
1372 
1373   call timab(timer_apply_inv_ovl_opernla, 2, tsec)
1374   call timab(timer_apply_inv_ovl_inv_s, 1, tsec)
1375 
1376   !multiply by S^-1
1377   ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER)
1378   call solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat*nspinor, sm1proj, PtPsm1proj, block_sliced)
1379   !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) MAP(to:sm1proj,PtPsm1proj)
1380   do idat  =1, ndat*nspinor
1381     do iproj = 1, nprojs
1382       do icplx = 1, cplx
1383         sm1proj(icplx,iproj,idat) = - sm1proj(icplx,iproj,idat)
1384         PtPsm1proj(icplx,iproj,idat) = - PtPsm1proj(icplx,iproj,idat)
1385       end do
1386     end do
1387   end do
1388 
1389   ABI_NVTX_END_RANGE()
1390 
1391   call timab(timer_apply_inv_ovl_inv_s, 2, tsec)
1392   call timab(timer_apply_inv_ovl_opernlb, 1, tsec)
1393 
1394   ! get the corresponding wf
1395   ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP2)
1396   cpopt = 2 ! reuse cprj
1397   choice = 7 ! get wf from cprj, without the application of S
1398   paw_opt = 3
1399   call nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,paw_opt,&
1400               signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,vectproj=sm1proj)
1401   ABI_NVTX_END_RANGE()
1402 
1403   call timab(timer_apply_inv_ovl_opernlb, 2, tsec)
1404   if (ham%istwf_k==2) mpi_enreg%me_g0=old_me_g0
1405 
1406   if(size(cwaveprj) > 1) then
1407     ABI_MALLOC(cwaveprj_in, (ham%natom,nspinor*ndat))
1408     call pawcprj_alloc(cwaveprj_in,0,ham%dimcprj)
1409     !$OMP TARGET UPDATE FROM(PtPsm1proj,proj)
1410     ! copy PtPsm1proj to cwaveprj(:,:)
1411     do idat=1, ndat*nspinor
1412       shift = 0
1413       do iatom = 1, ham%natom
1414         nlmn = cwaveprj(iatom, idat)%nlmn
1415         cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = PtPsm1proj(1:cplx, shift+1:shift+nlmn, idat)
1416         shift = shift + nlmn
1417       end do
1418     end do
1419     do idat=1, ndat*nspinor
1420       shift = 0
1421       do iatom = 1, ham%natom
1422         nlmn = cwaveprj_in(iatom, idat)%nlmn
1423         cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn) = proj(1:cplx, shift+1:shift+nlmn, idat)
1424         shift = shift + nlmn
1425       end do
1426     end do
1427     call pawcprj_axpby(one, one, cwaveprj_in, cwaveprj)
1428     call pawcprj_free(cwaveprj_in)
1429     ABI_FREE(cwaveprj_in)
1430   end if
1431 
1432   call abi_gpu_xaxpy(1, 2*npw*nspinor*ndat, cone, cwavef, 1, sm1cwavef, 1)
1433 
1434   if(transfer_omp_args) then
1435     !$OMP TARGET UPDATE FROM(sm1cwavef,cwavef)
1436     !$OMP TARGET EXIT DATA MAP(delete:sm1cwavef,cwavef)
1437   end if
1438 
1439   !$OMP TARGET EXIT DATA MAP(delete:gvnlxc)
1440   !$OMP TARGET EXIT DATA MAP(delete:proj,sm1proj,PtPsm1proj)
1441 
1442 end subroutine apply_invovl_ompgpu

m_invovl/destroy_invovl [ Functions ]


NAME

 destroy_invovl

FUNCTION

 Destruction of the invovl_kpt array

INPUTS

 nkpt= number of k-points

SOURCE

442  subroutine destroy_invovl(nkpt, gpu_option)
443 
444    integer, intent(in) :: nkpt
445    integer, intent(in) :: gpu_option
446    integer :: ikpt
447 
448 ! *************************************************************************
449 
450   ! TODO add cycling if kpt parallelism
451   do ikpt=1,nkpt
452     if(invovl_kpt(ikpt)%nprojs == -1) then
453       ! write(0, *) 'ERROR invovl_kpt is unallocated'
454       cycle
455     end if
456     call destroy_invovl_ikpt(ikpt, gpu_option)
457   end do
458 
459   ABI_FREE(invovl_kpt)
460 
461  end subroutine destroy_invovl

m_invovl/destroy_invovl_ikpt [ Functions ]


NAME

 destroy_invovl_ikpt

FUNCTION

 Destruction of the ikpt-th member of the invovl_kpt array

INPUTS

 ikpt= index of k-point

SOURCE

389  subroutine destroy_invovl_ikpt(ikpt, gpu_option)
390 
391    integer, intent(in) :: ikpt
392    integer, intent(in) :: gpu_option
393 
394 ! *************************************************************************
395 
396   if(gpu_option==ABI_GPU_OPENMP) then
397 #ifdef HAVE_OPENMP_OFFLOAD
398     if(gpu_initialized==1 .and. current_ikpt_in_gpu == ikpt) then
399       !$OMP TARGET EXIT DATA MAP(delete:current_gram_projs)
400       !$OMP TARGET EXIT DATA MAP(delete:current_inv_sij)
401       !$OMP TARGET EXIT DATA MAP(delete:current_inv_s_approx)
402       nullify(current_gram_projs)
403       nullify(current_inv_sij)
404       nullify(current_inv_s_approx)
405       current_ikpt_in_gpu = -1
406       !FIXME Smarter buffer management?
407       !!$OMP TARGET EXIT DATA MAP(delete:proj_ompgpu,sm1proj_ompgpu,PtPsm1proj_ompgpu)
408       ABI_FREE(proj_ompgpu)
409       ABI_FREE(sm1proj_ompgpu)
410       ABI_FREE(PtPsm1proj_ompgpu)
411       gpu_initialized = 0
412     end if
413 #endif
414   end if
415 
416   ABI_FREE(invovl_kpt(ikpt)%gram_projs)
417   ABI_FREE(invovl_kpt(ikpt)%inv_sij)
418   ABI_FREE(invovl_kpt(ikpt)%inv_s_approx)
419   invovl_kpt(ikpt)%nprojs = -1
420 
421 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
422   if (gpu_option == ABI_GPU_LEGACY .or. gpu_option == ABI_GPU_KOKKOS) then
423     call f_gpu_apply_invovl_inner_dealloc()
424     call f_gpu_apply_invovl_matrix_dealloc()
425   end if
426 #endif
427 
428  end subroutine destroy_invovl_ikpt

m_invovl/init_invovl [ Functions ]


NAME

 init_invovl

FUNCTION

 Initialization of the invovl_kpt array

INPUTS

 nkpt= number of k-points

SOURCE

362  subroutine init_invovl(nkpt)
363 
364   integer, intent(in) :: nkpt
365   integer :: ikpt
366 
367 ! *************************************************************************
368 
369   ABI_MALLOC(invovl_kpt, (nkpt))
370   ! TODO add cycling if kpt parallelism
371   do ikpt=1,nkpt
372     invovl_kpt(ikpt)%nprojs = -1
373   end do
374 
375  end subroutine init_invovl

m_invovl/invovl_kpt_type [ Types ]


NAME

 invovl_kpt_type

FUNCTION

 Contains information needed to invert the overlap matrix S

SOURCE

 87 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
 88 
 89  type, public :: invovl_kpt_type
 90 
 91    integer(kind=c_int32_t) :: nprojs
 92    !  total number of projectors
 93    !    nlmn for a specific atom = count(indlmn(3,:,itypat)>0)
 94    !  A value of -1 means that the following arrays are not allocated
 95 
 96    real(kind=c_double), allocatable :: gram_projs(:,:,:)
 97    ! gram_projs(cplx, nprojs, nprojs)
 98    ! projs' * projs
 99 
100    real(kind=c_double), allocatable :: inv_sij(:,:,:,:)
101    ! inv_sij(cplx, lmnmax, lmnmax, ntypat)
102    ! inverse of ham%sij
103 
104    real(kind=c_double), allocatable :: inv_s_approx(:,:,:,:)
105    ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat)
106    ! preconditioner
107 
108  end type invovl_kpt_type
109 
110  !> companion type to invovl_kpt_type to pass data to gpu/cuda
111  type, bind(c), public :: invovl_kpt_gpu_type
112 
113    integer(kind=c_int32_t) :: nprojs
114    !  total number of projectors
115    !    nlmn for a specific atom = count(indlmn(3,:,itypat)>0)
116    !  A value of -1 means that the following arrays are not allocated
117 
118    type(c_ptr) :: gram_projs
119    ! gram_projs(cplx, nprojs, nprojs)
120    ! projs' * projs
121 
122    integer(kind=c_int32_t) :: gram_projs_dim(3)
123 
124    type(c_ptr) :: inv_sij
125    ! inv_sij(cplx, lmnmax, lmnmax, ntypat)
126    ! inverse of ham%sij
127 
128    integer(kind=c_int32_t) :: inv_sij_dim(4)
129 
130    type(c_ptr) :: inv_s_approx
131    ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat)
132    ! preconditioner
133 
134    integer(kind=c_int32_t) :: inv_s_approx_dim(4)
135 
136  end type invovl_kpt_gpu_type
137 
138 #else
139 
140 type, public :: invovl_kpt_type
141 
142    integer :: nprojs
143    !  total number of projectors
144    !    nlmn for a specific atom = count(indlmn(3,:,itypat)>0)
145    !  A value of -1 means that the following arrays are not allocated
146 
147    real(dp), allocatable :: gram_projs(:,:,:)
148    ! gram_projs(cplx, nprojs, nprojs)
149    ! projs' * projs
150 
151    real(dp), allocatable :: inv_sij(:,:,:,:)
152    ! inv_sij(cplx, lmnmax, lmnmax, ntypat)
153    ! inverse of ham%sij
154 
155    real(dp), allocatable :: inv_s_approx(:,:,:,:)
156    ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat)
157    ! preconditioner
158 
159 end type invovl_kpt_type
160 
161 #endif
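
  The GPU companion type simply carries raw addresses plus explicit dimensions across the
  Fortran/C boundary. A minimal, standalone illustration of that pattern (not ABINIT code; the
  type and variable names below are invented for the example, and c_f_pointer stands in for
  what the receiving C/CUDA side would do with the pointer):

    program demo_cptr_view
      use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_double, c_loc, c_f_pointer
      implicit none
      type, bind(c) :: array_view
        type(c_ptr)        :: data
        integer(c_int32_t) :: dims(3)
      end type array_view
      real(c_double), allocatable, target :: gram(:,:,:)
      real(c_double), pointer             :: back(:,:,:)
      type(array_view) :: view
      allocate(gram(2, 4, 4)); call random_number(gram)
      view%data = c_loc(gram(1,1,1))              ! raw address handed across the boundary
      view%dims = int(shape(gram), c_int32_t)     ! shape passed alongside it
      call c_f_pointer(view%data, back, shape=int(view%dims))
      print *, 'round trip ok: ', all(back == gram)
    end program demo_cptr_view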

m_invovl/make_invovl [ Functions ]


NAME

 make_invovl

FUNCTION

 Builds the invovl structure

INPUTS

SOURCE

475 subroutine make_invovl(ham, dimffnl, ffnl, ph3d, mpi_enreg)
476 
477  use m_abi_linalg
478  implicit none
479 
480  type(gs_hamiltonian_type),intent(in), target :: ham
481  integer, intent(in) :: dimffnl
482  real(dp),intent(in) :: ffnl(ham%npw_k,dimffnl,ham%lmnmax,ham%ntypat)
483  real(dp),intent(in) :: ph3d(2,ham%npw_k,ham%matblk)
484  type(mpi_type) :: mpi_enreg
485 
486  real(dp) :: atom_projs(2, ham%npw_k, ham%lmnmax)
487  real(dp) :: temp(ham%npw_k)
488  complex(dpc), allocatable :: work(:)
489  real(dp), allocatable,target :: projs(:,:,:)
490  real(dp), allocatable :: gram_proj(:,:,:)
491  integer, allocatable :: ipiv(:)
492 
493  integer :: itypat, ilmn, nlmn, jlmn, ia, iaph3d, shift
494  integer :: il, ilm, jlm, ipw, info, ierr, cplx
495  integer :: ikpt_this_proc,cplex_dij
496  logical :: parity
497  real(dp) :: tsec(2)
498  character(len=500) :: message
499  character :: blas_transpose
500 
501  type(invovl_kpt_type), pointer :: invovl
502 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
503  type(invovl_kpt_gpu_type) :: invovl_gpu
504 #endif
505  integer :: array_nprojs_pp(mpi_enreg%nproc_fft)
506  integer :: iproc, slice_size
507  real(dp), allocatable :: gramwork(:,:,:)
508 
509  integer, parameter :: timer_mkinvovl = 1620, timer_mkinvovl_build_d = 1621, timer_mkinvovl_build_ptp = 1622
510 
511 ! *************************************************************************
512 
513  !! S = 1 + PDP', so
514  !! S^-1 = 1 + P inv_s_projs P', with
515  !! inv_s_projs = - (D^-1 + P'*P)^-1
516 
517  if(ham%istwf_k == 1) then
518    cplx = 2
519    blas_transpose = 'c'
520  else
521    cplx = 1
522    blas_transpose = 't'
523  end if
524 
525  ikpt_this_proc=bandfft_kpt_get_ikpt()
526  invovl => invovl_kpt(ikpt_this_proc)
527 
528  if(invovl%nprojs /= -1) then
529    ! We have been here before, cleanup before remaking
530    call destroy_invovl_ikpt(ikpt_this_proc, ham%gpu_option)
531  end if
532 
533  iaph3d = 1
534 
535  call timab(timer_mkinvovl,1,tsec)
536  call timab(timer_mkinvovl_build_d,1,tsec)
537 
538  ! build nprojs
539  invovl%nprojs = 0
540  do itypat=1,ham%ntypat
541    invovl%nprojs = invovl%nprojs + count(ham%indlmn(3,:,itypat)>0)*ham%nattyp(itypat)
542  end do
543 
544  ABI_MALLOC(projs, (2, ham%npw_k, invovl%nprojs))
545  ABI_MALLOC(invovl%inv_sij, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat))
546  ABI_MALLOC(invovl%inv_s_approx, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat))
547  ! workspace for inversion
548  ABI_MALLOC(ipiv, (ham%lmnmax))
549  ABI_MALLOC(work, (64*ham%lmnmax))
550 
551  projs = zero
552  invovl%inv_sij = zero
553  invovl%inv_s_approx = zero
554 
555  shift = 0
556  do itypat = 1, ham%ntypat
557    nlmn = count(ham%indlmn(3,:,itypat)>0)
558    if (size(ham%sij(:,itypat))==ham%lmnmax*(ham%lmnmax+1)/2) then
559      cplex_dij = 1
560    else if (size(ham%sij(:,itypat))==ham%lmnmax*(ham%lmnmax+1)) then
561      cplex_dij = 2
562    else
563      ABI_ERROR('sij size not recognized')
564    end if
565    !! unpack ham%sij into inv_sij
566    do jlmn = 1, nlmn
567      if (cplex_dij==1) then
568        invovl%inv_sij(1, jlmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + jlmn, itypat)
569      else
570        invovl%inv_sij(1, jlmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1) + 2*jlmn-1, itypat)
571      end if
572      do ilmn = 1, jlmn-1
573        if (cplex_dij==1) then
574          invovl%inv_sij(1, ilmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat)
575          invovl%inv_sij(1, jlmn, ilmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat)
576        else
577          invovl%inv_sij(1, ilmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1) + 2*ilmn-1, itypat)
578          invovl%inv_sij(1, jlmn, ilmn, itypat) = ham%sij(jlmn*(jlmn-1) + 2*ilmn-1, itypat)
579 
580        end if
581      end do
582    end do
583 
584    ! Invert sij
585    if(cplx == 2) then
586      call ZHETRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
587      call ZHETRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
588    else
589      call DSYTRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
590      call DSYTRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
591    end if
592    ! complete the matrix
593    do ilm=1, nlmn
594      do jlm=1, ilm-1
595        invovl%inv_sij(1,ilm,jlm,itypat) =  invovl%inv_sij(1,jlm,ilm,itypat)
596      end do
597    end do
598 
599    !! loop on atoms to build atom_projs and fill projs, s_projs
600    do ia = 1, ham%nattyp(itypat)
601 
602      !! build atom_projs, from opernlb
603      !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l
604      atom_projs(:,:,:) = 0
605 
606      ! start from 4pi/sqrt(ucvol)*ffnl
607      ! atom_projs(1, :, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(:, 1, 1:nlmn)
608      ! TODO vectorize (DCOPY with stride)
609      do ipw=1, ham%npw_k
610        atom_projs(1,ipw, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(ipw, 1, 1:nlmn, itypat)
611      end do
612 
613      ! multiply by (-i)^l
614      do ilmn=1,nlmn
615        il=mod(ham%indlmn(1,ilmn, itypat),4);
616        parity=(mod(il,2)==0)
617        if (il>1) then
618          ! multiply by -1
619          atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn)
620        end if
621        if(.not. parity) then
622          ! multiply by -i
623          temp = atom_projs(2,:,ilmn)
624          atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn)
625          atom_projs(1,:,ilmn) =  temp
626        end if
627      end do
628 
629      ! multiply by conj(ph3d)
630      do ilmn=1,nlmn
631        temp = atom_projs(1, :, ilmn)
632        atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d(1, :, iaph3d) + atom_projs(2, :, ilmn) * ph3d(2, :, iaph3d)
633        atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d(1, :, iaph3d) - temp                   * ph3d(2, :, iaph3d)
634      end do
635 
636      ! me_g0 trick
637      if(ham%istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then
638        atom_projs(1,1,:) = atom_projs(1,1,:) / sqrt2
639        atom_projs(2,1,:) = zero
640      end if
641      if(ham%istwf_k == 2) then
642        atom_projs(:,:,:) = atom_projs(:,:,:)*sqrt2
643      end if
644 
645 
 646      !! atom_projs is built; copy it into projs
647      projs(:, :, shift+1:shift+nlmn) = atom_projs(:, :, 1:nlmn)
648      shift = shift + nlmn
649 
650      ! build inv_s_approx = (D^-1+PtP)^-1 restricted to a single atom block
651      ! can be optimized (real, build directly from ffnl)
652      if(ia == 1) then
653        ! D^-1
654        invovl%inv_s_approx(1, :, :, itypat) = invovl%inv_sij(1, :, :, itypat)
655        ! + PtP
656        ABI_MALLOC(gram_proj, (cplx, nlmn, nlmn))
657        call abi_xgemm(blas_transpose,'N', nlmn, nlmn, (3-cplx)*ham%npw_k, cone, atom_projs(:,:,1), (3-cplx)*ham%npw_k, &
658 &                     atom_projs(:,:,1), (3-cplx)*ham%npw_k, czero, gram_proj(:,:,1), nlmn,x_cplx=cplx)
659        call xmpi_sum(gram_proj,mpi_enreg%comm_bandspinorfft,ierr)
660        invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) = invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) + gram_proj(:,:,:)
661        ABI_FREE(gram_proj)
662        ! ^-1
663        if(cplx == 2) then
664          call ZHETRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
665          call ZHETRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
666        else
667          call DSYTRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
668          call DSYTRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
669        end if
670        ! complete lower triangle of matrix
671        do ilm=1, nlmn
672          do jlm=1, ilm-1
673            invovl%inv_s_approx(1, ilm, jlm, itypat) =  invovl%inv_s_approx(1, jlm, ilm, itypat)
674            if(cplx == 2) then
675              invovl%inv_s_approx(2, ilm, jlm, itypat) =  -invovl%inv_s_approx(2, jlm, ilm, itypat)
676            end if
677          end do
678        end do
679      end if
680 
681      iaph3d = iaph3d + 1
682    end do
683  end do
684  ABI_FREE(ipiv)
685  ABI_FREE(work)
686 
687  call timab(timer_mkinvovl_build_d, 2, tsec)
688  call timab(timer_mkinvovl_build_ptp, 1, tsec)
689 
690  ! Compute P'P one column slice at a time (might be too big to fit in one proc)
691  if(mpi_enreg%paral_kgb == 1) then
692    ! Split the work evenly among the fft processors
693    array_nprojs_pp(:) = invovl%nprojs / mpi_enreg%nproc_fft
694    ! not enough work, there are MOD(nprojs,mpi_enreg%nproc_fft) tasks left
695    ! assign them to the first ones
696    array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) = array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) + 1
697  else
698    array_nprojs_pp = invovl%nprojs
699  end if
700  ABI_MALLOC(invovl%gram_projs, (cplx,invovl%nprojs,array_nprojs_pp(mpi_enreg%me_fft+1)))
701  shift = 0
702  if (ham%gpu_option==ABI_GPU_OPENMP .and. mpi_enreg%nproc_fft==1) then
703 #ifdef HAVE_OPENMP_OFFLOAD
704    ! compute gram_projs in one GEMM, only one FFT proc expected in GPU mode
705    slice_size = array_nprojs_pp(1)
706    current_gram_projs   => invovl%gram_projs
707    !$OMP TARGET ENTER DATA MAP(alloc:current_gram_projs)
708    !$OMP TARGET ENTER DATA MAP(to:projs)
709 
710    !$OMP TARGET DATA USE_DEVICE_PTR(current_gram_projs,projs)
711    call abi_gpu_xgemm(cplx, blas_transpose,'N', invovl%nprojs, slice_size, (3-cplx)*ham%npw_k, cone, &
712    &                  c_loc(projs), (3-cplx)*ham%npw_k, &
713    &                  c_loc(projs), (3-cplx)*ham%npw_k, czero, c_loc(current_gram_projs), invovl%nprojs)
714    !$OMP END TARGET DATA
715    !$OMP TARGET EXIT DATA MAP(from:current_gram_projs)
716    !$OMP TARGET EXIT DATA MAP(delete:projs)
717    call xmpi_sum(invovl%gram_projs,mpi_enreg%comm_band,ierr)
718 #endif
719  else
720    do iproc = 1, mpi_enreg%nproc_fft
721      ! compute local contribution to slice iproc of gram_projs
722      slice_size = array_nprojs_pp(iproc)
723      ABI_MALLOC(gramwork, (cplx,invovl%nprojs,slice_size))
724      call abi_xgemm(blas_transpose,'N', invovl%nprojs, slice_size, (3-cplx)*ham%npw_k, cone, projs(:,:,1), (3-cplx)*ham%npw_k, &
725      &                   projs(:, :, shift+1), (3-cplx)*ham%npw_k, czero, gramwork(:,:,1), invovl%nprojs,x_cplx=cplx)
726      shift = shift + slice_size
727      ! reduce on proc i
728      call xmpi_sum_master(gramwork, iproc-1, mpi_enreg%comm_fft, ierr)
729      if(iproc == mpi_enreg%me_fft+1) then
730        invovl%gram_projs = gramwork
731      end if
732      ABI_FREE(gramwork)
733    end do
734    call xmpi_sum(invovl%gram_projs,mpi_enreg%comm_band,ierr)
735  end if
736 
737  call timab(timer_mkinvovl_build_ptp, 2, tsec)
738  call timab(timer_mkinvovl,2,tsec)
739 
740  ABI_FREE(projs)
741 
742 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
743 
744  ! upload inverse overlap matrices (sij and s_approx) to GPU memory
745  if (ham%gpu_option==ABI_GPU_LEGACY .or. ham%gpu_option==ABI_GPU_KOKKOS) then
746    ! allocate memory for sij and s_approx on GPU
747    write(message,'(a,a,i12,a,a,i6,a,a,i6,a,a,es12.4,a)') &
748      & 'Allocate GPU memory for inverse overlap computations (sij and s_approx) : ',&
749      & 'nprojs=',invovl%nprojs,ch10,&
750      & 'lmnmax=',ham%lmnmax,ch10,&
751      & 'ntypat=',ham%ntypat,ch10,&
752      & 'gram_projs_gpu_size (GBytes)=',1e-9*cplx*invovl%nprojs*invovl%nprojs*dp,ch10
753    call wrtout(std_out,message,'COLL')
754    call f_gpu_apply_invovl_matrix_alloc(cplx, invovl%nprojs, ham%lmnmax, ham%ntypat, 0)
755 
756    invovl_gpu = make_invovl_kpt_gpu(invovl)
757    call f_upload_inverse_overlap(invovl_gpu, cplx, invovl%nprojs, ham%lmnmax, ham%ntypat)
758    write(message,*) 'Invovl uploaded to GPU memory'
759    call wrtout(std_out,message,'COLL')
760  end if
761 
762 #endif
763 
764 #ifdef HAVE_OPENMP_OFFLOAD
765  if (ham%gpu_option==ABI_GPU_OPENMP) then
766    call refresh_invovl_ompgpu_kpt(ikpt_this_proc)
767  end if
768 #endif
769 
770  write(message,*) 'Invovl built'
771  call wrtout(std_out,message,'COLL')
772 
773 end subroutine make_invovl
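
  The unpacking of ham%sij above relies on the packed upper-triangular storage of the PAW
  overlap blocks: for cplex_dij = 1, entry (ilmn, jlmn) with ilmn <= jlmn sits at index
  jlmn*(jlmn-1)/2 + ilmn (for cplex_dij = 2 the real part sits at twice that index minus one).
  A minimal standalone sketch of the cplex_dij = 1 convention (toy values, names local to
  this example):

    program demo_packed_sij
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: nlmn = 4
      real(dp) :: full(nlmn, nlmn), packed(nlmn*(nlmn+1)/2)
      integer  :: ilmn, jlmn
      ! fill the packed upper triangle with recognizable values
      do jlmn = 1, nlmn
        do ilmn = 1, jlmn
          packed(jlmn*(jlmn-1)/2 + ilmn) = real(10*ilmn + jlmn, dp)
        end do
      end do
      ! unpack and symmetrize, mirroring the loop in make_invovl
      do jlmn = 1, nlmn
        do ilmn = 1, jlmn
          full(ilmn, jlmn) = packed(jlmn*(jlmn-1)/2 + ilmn)
          full(jlmn, ilmn) = full(ilmn, jlmn)
        end do
      end do
      print '(4f8.1)', (full(ilmn, :), ilmn = 1, nlmn)
    end program demo_packed_sij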

m_invovl/make_invovl_kpt_gpu [ Functions ]


NAME

 make_invovl_kpt_gpu

FUNCTION

 Creates an invovl_kpt_gpu_type from its CPU counterpart, for CUDA interoperability

SOURCE

272   function make_invovl_kpt_gpu(invovl) result(invovl_gpu)
273     implicit none
274     type(invovl_kpt_type), intent(inout),target :: invovl
275     type(invovl_kpt_gpu_type)                   :: invovl_gpu
276 
277     invovl_gpu%nprojs = invovl%nprojs
278 
279     invovl_gpu%gram_projs = c_loc(invovl%gram_projs(1,1,1))
280     invovl_gpu%gram_projs_dim = (/ &
281       & size(invovl%gram_projs,1), &
282       & size(invovl%gram_projs,2), &
283       & size(invovl%gram_projs,3)  &
284       & /)
285 
286     invovl_gpu%inv_sij = c_loc(invovl%inv_sij(1,1,1,1))
287     invovl_gpu%inv_sij_dim = (/ &
288       & size(invovl%inv_sij,1), &
289       & size(invovl%inv_sij,2), &
290       & size(invovl%inv_sij,3), &
291       & size(invovl%inv_sij,4)  &
292       & /)
293 
294     invovl_gpu%inv_s_approx = c_loc(invovl%inv_s_approx(1,1,1,1))
295     invovl_gpu%inv_s_approx_dim = (/ &
296       & size(invovl%inv_s_approx,1), &
297       & size(invovl%inv_s_approx,2), &
298       & size(invovl%inv_s_approx,3), &
299       & size(invovl%inv_s_approx,4)  &
300       & /)
301 
302   end function make_invovl_kpt_gpu

m_invovl/solve_inner [ Functions ]


NAME

 solve_inner

FUNCTION

 Helper function: iteratively solves the inner system

INPUTS

SOURCE

1051 subroutine solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, PtPsm1proj, block_sliced)
1052 
1053  use m_abi_linalg
1054  implicit none
1055 
1056  integer,intent(in) :: ndat,cplx
1057  type(invovl_kpt_type), intent(in) :: invovl
1058  real(dp), intent(inout) :: proj(cplx, invovl%nprojs,ndat)
1059  real(dp), intent(inout) :: sm1proj(cplx, invovl%nprojs, ndat)
1060  real(dp), intent(inout) :: PtPsm1proj(cplx, invovl%nprojs, ndat)
1061  real(dp), allocatable :: temp_proj(:,:,:)
1062  type(mpi_type), intent(in) :: mpi_enreg
1063  type(gs_hamiltonian_type),intent(in) :: ham
1064  integer, intent(in) :: block_sliced
1065 
1066  integer :: array_nlmntot_pp(mpi_enreg%nproc_fft)
1067  integer :: nlmntot_this_proc, ibeg, iend, ierr, i, nprojs
1068  real(dp) :: resid(cplx, invovl%nprojs,ndat), precondresid(cplx, invovl%nprojs,ndat)
1069  real(dp) :: normprojs(ndat), errs(ndat), maxerr, previous_maxerr
1070  character(len=500) :: message
1071 
1072  real(dp), parameter :: precision = 1e-16 ! maximum relative error. TODO: use tolwfr ?
1073  real(dp) :: convergence_rate
1074  integer :: additional_steps_to_take
1075 
1076 ! *************************************************************************
1077 
1078  nprojs = invovl%nprojs
1079  normprojs = SUM(SUM(proj**2, 1),1)
1080  call xmpi_sum(normprojs, mpi_enreg%comm_fft, ierr)
1081 
1082  ! Compute work distribution : split nprojs evenly between the fft processors
1083  if(mpi_enreg%paral_kgb == 1) then
1084    array_nlmntot_pp(:) = nprojs / mpi_enreg%nproc_fft
1085    ! not enough work, there are MOD(nprojs,mpi_enreg%nproc_fft) tasks left
1086    ! assign them to the first ones
1087    array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) = array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) + 1
1088    ibeg = SUM(array_nlmntot_pp(1:mpi_enreg%me_fft)) + 1
1089    iend = ibeg + array_nlmntot_pp(1+mpi_enreg%me_fft) - 1
1090    nlmntot_this_proc = iend - ibeg + 1
1091  else
1092    ibeg = 1
1093    iend = nprojs
1094    nlmntot_this_proc = nprojs
1095  end if
1096 
1097  ABI_MALLOC(temp_proj, (cplx, nlmntot_this_proc, ndat))
1098 
1099  ! first guess for sm1proj
1100  call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, proj, sm1proj, block_sliced)
1101 
1102  ! Iterative refinement
1103  ! TODO use a more efficient iterative algorithm than iterative refinement, use locking
1104  additional_steps_to_take = -1
1105  do i=1, 30
1106    ! compute resid = proj - (D^-1 + PtP)sm1proj
1107    call apply_block(ham, cplx, invovl%inv_sij, nprojs, ndat, sm1proj, resid, block_sliced)
1108    temp_proj = sm1proj(:,ibeg:iend,:)
1109 
1110    ! compute matrix multiplication : PtPsm1proj(:,:,1) = invovl%gram * temp_proj(:,:,1)
1111    call abi_xgemm('N', 'N', nprojs, ndat, nlmntot_this_proc, cone, &
1112      & invovl%gram_projs(:,:,1), nprojs, &
1113      & temp_proj(:,:,1), nlmntot_this_proc, czero, &
1114      & PtPsm1proj(:,:,1), nprojs, &
1115      & x_cplx=cplx)
1116    call xmpi_sum(PtPsm1proj, mpi_enreg%comm_fft, ierr)
1117    resid = proj - resid - PtPsm1proj
1118    ! exit check
1119    errs = SUM(SUM(resid**2, 1),1)
1120    call xmpi_sum(errs, mpi_enreg%comm_fft, ierr)
1121 
1122    maxerr = sqrt(MAXVAL(errs/normprojs))
1123    if(maxerr < precision .or. additional_steps_to_take == 1) then
1124      exit
1125      ! We might stall and never get to the specified precision because of machine errors.
1126      ! If we got to 1e-10, extrapolate convergence rate and determine the number of additional
1127      ! steps to take to reach precision
1128    else if(maxerr < 1e-10 .and. additional_steps_to_take == -1) then
1129      convergence_rate = -LOG(1e-10) / i
1130      additional_steps_to_take = CEILING(-LOG(precision/1e-10)/convergence_rate) + 1
1131    else if(additional_steps_to_take > 0) then
1132      if(previous_maxerr<maxerr)exit
1133      additional_steps_to_take = additional_steps_to_take - 1
1134    end if
1135    previous_maxerr=maxerr
1136 
1137    ! add preconditioned residual
1138    call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, resid, precondresid, block_sliced)
1139    sm1proj = sm1proj + precondresid
1140  end do
1141 
1142  if(maxerr >= precision .and. maxerr >= 1e-10) then
1143    write(message, *) 'In invovl, max error was', maxerr, ' after 30 iterations'
1144    ABI_WARNING(message)
1145  else
1146    ! write(message,'(a,i2,a,es13.5)') 'Iterative solver in invovl finished in ', i, ' iterations, error', maxerr
1147    ! call wrtout(std_out,message,'COLL')
1148  end if
1149 
1150  ABI_FREE(temp_proj)
1151 
1152 end subroutine solve_inner
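
  The scheme above is preconditioned iterative refinement for (D^-1 + P'P) y = proj: start from
  y = M^-1 proj with M^-1 = inv_s_approx (the single-atom-block inverse), then repeatedly add
  M^-1 (proj - A y). A minimal dense sketch of the same idea, with the diagonal of A playing the
  role of inv_s_approx (toy sizes and tolerance, names local to this example):

    program demo_iterative_refinement
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: n = 50
      real(dp) :: a(n, n), b(n), y(n), resid(n), minv(n)
      integer  :: i, iter
      call random_number(a); a = 0.01_dp*(a + transpose(a))
      call random_number(b)
      do i = 1, n
        a(i, i) = a(i, i) + 1.0_dp        ! well-conditioned, diagonally dominant A
        minv(i) = 1.0_dp / a(i, i)        ! crude preconditioner: inverse of the diagonal
      end do
      y = minv*b                          ! first guess, as with inv_s_approx
      do iter = 1, 30
        resid = b - matmul(a, y)          ! residual of the current iterate
        if (sqrt(sum(resid**2)) < 1.0e-14_dp*sqrt(sum(b**2))) exit
        y = y + minv*resid                ! add the preconditioned residual
      end do
      print *, 'iterations:', iter, '  max |b - A y|:', maxval(abs(b - matmul(a, y)))
    end program demo_iterative_refinement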

m_invovl/solve_inner_ompgpu [ Functions ]


NAME

 solve_inner_ompgpu

FUNCTION

 Helper function: iteratively solves the inner system (OpenMP GPU offload implementation)

INPUTS

SOURCE

1455 subroutine solve_inner_ompgpu(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, PtPsm1proj, block_sliced)
1456 
1457  use m_abi_linalg
1458  implicit none
1459 
1460  integer,intent(in) :: ndat,cplx
1461  type(invovl_kpt_type), intent(in), target :: invovl
1462  real(dp), intent(inout) :: proj(cplx, invovl%nprojs,ndat)
1463  real(dp), intent(inout), target :: sm1proj(cplx, invovl%nprojs, ndat)
1464  real(dp), intent(inout), target :: PtPsm1proj(cplx, invovl%nprojs, ndat)
1465  type(mpi_type), intent(in) :: mpi_enreg
1466  type(gs_hamiltonian_type),intent(in) :: ham
1467  integer, intent(in) :: block_sliced
1468 
1469  integer :: array_nlmntot_pp(mpi_enreg%nproc_fft)
1470  integer :: nlmntot_this_proc, ibeg, iend, ierr, i, nprojs
1471  real(dp) :: resid(cplx, invovl%nprojs,ndat), precondresid(cplx, invovl%nprojs,ndat)
1472  real(dp) :: normprojs(ndat), errs(ndat), maxerr, previous_maxerr
1473  character(len=500) :: message
1474 
1475  real(dp), parameter :: precision = 1e-16 ! maximum relative error. TODO: use tolwfr ?
1476  real(dp) :: convergence_rate,sum_tmp
1477  integer :: additional_steps_to_take,idat,iproj,icplx
1478  integer :: Ptsize(3)
1479 #ifdef HAVE_GPU_HIP
1480  real(dp), ABI_CONTIGUOUS pointer :: sm1proj_amdref(:,:,:),PtPsm1proj_amdref(:,:,:)
1481 #endif
1482 
1483 ! *************************************************************************
1484 
1485  Ptsize(1) = cplx
1486  Ptsize(2) = invovl%nprojs
1487  Ptsize(3) = ndat
1488  nprojs = invovl%nprojs
1489 #if defined HAVE_GPU_HIP  && defined FC_LLVM
1490  !FIXME Work-around for AOMP v15.0.3 (AMD Flang fork)
1491  sm1proj_amdref => sm1proj
1492  PtPsm1proj_amdref => PtPsm1proj
1493 #endif
1494 
1495  !$OMP TARGET ENTER DATA MAP(alloc:errs,precondresid,resid,normprojs)
1496 
1497  !FIXME LLVM has trouble with performing team reduction (AOMP 15.0.2)
1498 #ifdef FC_LLVM
1499  !$OMP TARGET UPDATE FROM(proj)
1500 #else
1501  !$OMP TARGET TEAMS DISTRIBUTE MAP(to:normprojs,proj) PRIVATE(idat,sum_tmp)
1502 #endif
1503  do idat = 1,ndat
1504   sum_tmp=0
1505 #ifndef FC_LLVM
1506   !$OMP PARALLEL DO COLLAPSE(2) REDUCTION(+:sum_tmp) PRIVATE(iproj,icplx)
1507 #endif
1508   do iproj = 1,nprojs
1509     do icplx = 1,cplx
1510       sum_tmp = sum_tmp + proj(icplx,iproj,idat)**2
1511     end do
1512   end do
1513   normprojs(idat)=sum_tmp
1514  end do
1515 #ifndef FC_LLVM
1516  !$OMP TARGET UPDATE FROM(normprojs)
1517 #endif
1518 
1519  ibeg = 1
1520  iend = nprojs
1521  nlmntot_this_proc = nprojs
1522 
1523  ! first guess for sm1proj
1524  call apply_block_ompgpu(ham, cplx, invovl%inv_s_approx, nprojs, ndat, proj, sm1proj, block_sliced)
1525 
1526  ! Iterative refinement
1527  ! TODO use a more efficient iterative algorithm than iterative refinement, use locking
1528  additional_steps_to_take = -1
1529  do i=1, 30
1530    ! compute resid = proj - (D^-1 + PtP)sm1proj
1531    call apply_block_ompgpu(ham, cplx, invovl%inv_sij, nprojs, ndat, sm1proj, resid, block_sliced)
1532 
1533    ! compute matrix multiplication : PtPsm1proj(:,:,1) = invovl%gram * sm1proj(:,:,1)
1534    ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER_GEMM)
1535 #if defined HAVE_GPU_HIP && defined FC_LLVM
1536    !$OMP TARGET DATA USE_DEVICE_PTR(current_gram_projs, sm1proj_amdref, PtPsm1proj_amdref)
1537    call abi_gpu_xgemm(cplx, 'N', 'N', nprojs, ndat, nlmntot_this_proc, cone, &
1538                 c_loc(current_gram_projs), nprojs,&
1539                 c_loc(sm1proj_amdref), nlmntot_this_proc, czero, &
1540                 c_loc(PtPsm1proj_amdref), nprojs)
1541    !$OMP END TARGET DATA
1542 #else
1543    !$OMP TARGET DATA USE_DEVICE_PTR(current_gram_projs, sm1proj, PtPsm1proj)
1544    call abi_gpu_xgemm(cplx, 'N', 'N', nprojs, ndat, nlmntot_this_proc, cone, &
1545                 c_loc(current_gram_projs), nprojs,&
1546                 c_loc(sm1proj), nlmntot_this_proc, czero, &
1547                 c_loc(PtPsm1proj), nprojs)
1548    !$OMP END TARGET DATA
1549 #endif
1550 
1551    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
1552    !$OMP& PRIVATE(idat,iproj,icplx) MAP(to:proj,resid,PtPsm1proj)
1553    do idat =1, ndat
1554      do iproj =1, nprojs
1555        do icplx = 1,cplx
1556          resid(icplx, iproj, idat) = proj(icplx, iproj, idat) - resid(icplx, iproj, idat) - PtPsm1proj(icplx, iproj, idat)
1557        end do
1558      end do
1559    end do
1560 
1561    ! exit check
1562 #ifdef FC_LLVM
1563    !FIXME LLVM has trouble with performing team reduction (v16.0.0 from AMD ROCm 5.6.0)
1564    !$OMP TARGET UPDATE FROM(resid)
1565    errs = SUM(SUM(resid**2, 1),1)
1566 #else
1567    !$OMP TARGET TEAMS DISTRIBUTE MAP(to:errs,resid) PRIVATE(idat,sum_tmp)
1568    do idat = 1,ndat
1569      sum_tmp=0
1570      !$OMP PARALLEL DO COLLAPSE(2) REDUCTION(+:sum_tmp) PRIVATE(iproj,icplx)
1571      do iproj = 1,nprojs
1572        do icplx = 1,cplx
1573          sum_tmp = sum_tmp + resid(icplx,iproj,idat)**2
1574        end do
1575      end do
1576      errs(idat)=sum_tmp
1577    end do
1578    !$OMP TARGET UPDATE FROM(errs)
1579 #endif
1580 
1581    ABI_NVTX_END_RANGE()
1582 
1583    maxerr = sqrt(MAXVAL(errs/normprojs))
1584    if(maxerr < precision .or. additional_steps_to_take == 1) then
1585      exit
1586      ! We might stall and never get to the specified precision because of machine errors.
1587      ! If we got to 1e-10, extrapolate convergence rate and determine the number of additional
1588      ! steps to take to reach precision
1589    else if(maxerr < 1e-10 .and. additional_steps_to_take == -1) then
1590      convergence_rate = -LOG(1e-10) / i
1591      additional_steps_to_take = CEILING(-LOG(precision/1e-10)/convergence_rate) + 1
1592    else if(additional_steps_to_take > 0) then
1593      if(previous_maxerr<maxerr)exit
1594      additional_steps_to_take = additional_steps_to_take - 1
1595    end if
1596    previous_maxerr=maxerr
1597 
1598    ! add preconditioned residual
1599    call apply_block_ompgpu(ham, cplx, invovl%inv_s_approx, nprojs, ndat, resid, precondresid, block_sliced)
1600 
1601    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
1602    !$OMP& PRIVATE(idat,iproj,icplx) MAP(to:sm1proj,precondresid)
1603    do idat =1, ndat
1604      do iproj =1, nprojs
1605        do icplx = 1,cplx
1606          sm1proj(icplx, iproj, idat) = sm1proj(icplx, iproj, idat) + precondresid(icplx, iproj, idat)
1607        end do
1608      end do
1609    end do
1610  end do
1611  !$OMP TARGET EXIT DATA MAP(delete:errs,resid,precondresid,normprojs)
1612 
1613  if(maxerr >= precision .and. maxerr >= 1e-10) then
1614    write(message, *) 'In invovl, max error was', maxerr, ' after 30 iterations'
1615    ABI_WARNING(message)
1616  else
1617    ! write(message,'(a,i2,a,es13.5)') 'Iterative solver in invovl finished in ', i, ' iterations, error', maxerr
1618    ! call wrtout(std_out,message,'COLL')
1619  end if
1620 
1621 end subroutine solve_inner_ompgpu