TABLE OF CONTENTS


ABINIT/m_prep_kgb [ Modules ]

[ Top ] [ Modules ]

NAME

  m_prep_kgb

FUNCTION

  This module provides wrappers that used to apply the full Hamiltonian or just the Vnl part
  or to perform the FFT of the wavefunctions when the orbitals are distributed in linalg mode (paral_kgb = 1).

COPYRIGHT

  Copyright (C) 1998-2024 ABINIT group (FBottin,MT,GZ,MD,FDahm)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

17 #if defined HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "abi_common.h"
22 
23 module m_prep_kgb
24 
25  use defs_basis
26  use m_abicore
27  use m_errors
28  use m_xmpi
29  use m_abi_linalg
30 
31  use defs_abitypes, only : MPI_type
32  use m_time,        only : timab
33  use m_bandfft_kpt, only : bandfft_kpt, bandfft_kpt_get_ikpt, bandfft_kpt_type
34  use m_pawcprj,     only : pawcprj_type
35  use m_hamiltonian, only : gs_hamiltonian_type
36  use m_nonlop,      only : nonlop
37  use m_getghc,      only : multithreaded_getghc
38  use m_fft,         only : fourwf
39 
40 #if defined HAVE_GPU_CUDA
41  use m_manage_cuda
42 #endif
43 
44 #if defined HAVE_YAKL
45  use gator_mod
46 #endif
47 
48 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
49  use m_gpu_toolbox, only : CPU_DEVICE_ID, gpu_device_synchronize, gpu_data_prefetch_async
50 #endif
51 
52 #if defined HAVE_OPENMP_OFFLOAD
53  use m_ompgpu_fourwf
54 #endif
55 
56  implicit none
57 
58  private

ABINIT/prep_fourwf [ Functions ]

[ Top ] [ Functions ]

NAME

 prep_fourwf

FUNCTION

 this routine prepares the data to the call of fourwf.

INPUTS

  blocksize= size of block for FFT
  cwavef(2,npw*ndat)=planewave coefficients of wavefunction (one spinorial component?).
  dtfil <type(datafiles_type)>=variables related to files
  kg_k(3,npw_k)=reduced planewave coordinates.
  lmnmax=if useylm=1, max number of (l,m,n) comp. over all type of psps
        =if useylm=0, max number of (l,n)   comp. over all type of psps
  mgfft=maximum size of 1d ffts
  mpi_enreg=information about mpi parallelization
  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
  mpssoang= 1+maximum (spin*angular momentum) for nonlocal pseudopotentials
  natom=number of atoms in cell.
  nband_k=number of bands at this k point for that spin polarization
  ndat=number of FFT to do in //
  ngfft(18)= contain all needed information about 3D FFT
  npw_k=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  ntypat=number of types of atoms in unit cell.
  n4,n5,n6 used for dimensionning of vlocal
  option_fourwf=option for fourwf (see fourwf.F90)
  prtvol=control print volume and debugging output
  ucvol=unit cell volume
  [bandfft_kpt_tab]= (optional) if present, contains tabs used to implement
                     the "band-fft" parallelism
                      if not present, the bandfft_kpt global variable is used
  [gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)  

OUTPUT

  gwavef=(2,npw*ndat)=matrix elements <G|H|C>.

SIDE EFFECTS

SOURCE

1032 subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,&
1033 &          mpi_enreg,nband_k,ndat,ngfft,npw_k,n4,n5,n6,occ_k,option_fourwf,ucvol,wtk,&
1034 &          bandfft_kpt_tab,gpu_option) ! Optional arguments
1035 
1036 !Arguments ------------------------------------
1037 !scalars
1038  integer,intent(in) :: blocksize,iblock,istwf_k,mgfft,n4,n5,n6,nband_k,ndat,npw_k
1039  integer,intent(in) :: option_fourwf
1040  integer,intent(in),optional :: gpu_option
1041  real(dp),intent(in) :: ucvol,wtk
1042  type(bandfft_kpt_type),optional,target,intent(in) :: bandfft_kpt_tab
1043  type(mpi_type),intent(in) :: mpi_enreg
1044 !arrays
1045  integer,intent(in) :: ngfft(18)
1046  real(dp),intent(in) :: occ_k(nband_k)
1047  real(dp),intent(out) :: rhoaug(n4,n5,n6)
1048  real(dp),intent(in), target :: cwavef(2,npw_k*blocksize)
1049  real(dp),target,intent(inout) :: wfraug(2,n4,n5,n6*ndat)
1050 
1051 !Local variables-------------------------------
1052 !scalars
1053  integer :: bandpp,bandpp_sym,ier,iibandpp,ikpt_this_proc,ind_occ,ind_occ1,ind_occ2,ipw
1054  integer :: istwf_k_,jjbandpp,me_fft,nd3,nproc_band,nproc_fft,npw_fft
1055  integer :: spaceComm=0,tim_fourwf,gpu_option_
1056  integer,pointer :: idatarecv0,ndatarecv,ndatarecv_tot,ndatasend_sym
1057  logical :: flag_inv_sym,have_to_reequilibrate
1058  real(dp) :: weight,weight1,weight2
1059  type(bandfft_kpt_type),pointer :: bandfft_kpt_ptr
1060 !arrays
1061  integer,ABI_CONTIGUOUS pointer :: indices_pw_fft(:),kg_k_fft(:,:),kg_k_gather(:,:),kg_k_gather_sym(:,:)
1062  integer,ABI_CONTIGUOUS pointer :: rdispls(:),rdispls_sym(:)
1063  integer,ABI_CONTIGUOUS pointer :: recvcounts(:),recvcount_fft(:),recvcounts_sym(:),recvcounts_sym_tot(:)
1064  integer,ABI_CONTIGUOUS pointer :: recvdisp_fft(:),sdispls(:),sdispls_sym(:)
1065  integer,ABI_CONTIGUOUS pointer :: sendcounts(:),sendcount_fft(:),sendcounts_sym(:),sendcounts_sym_all(:)
1066  integer,ABI_CONTIGUOUS pointer :: senddisp_fft(:),tab_proc(:)
1067  integer,allocatable :: rdisplsloc(:)
1068  integer,allocatable :: recvcountsloc(:),sdisplsloc(:)
1069  integer,allocatable :: sendcountsloc(:)
1070  integer,allocatable :: index_wavef_band(:),index_wavef_send(:)
1071  integer,pointer :: gbound_(:,:)
1072  real(dp) :: dummy(2,1),tsec(2)
1073  real(dp),allocatable :: buff_wf(:,:)
1074 
1075 #if defined HAVE_GPU && defined HAVE_YAKL
1076  real(c_double), ABI_CONTIGUOUS pointer :: cwavef_alltoall1(:,:) => null()
1077 #else
1078  real(dp),allocatable :: cwavef_alltoall1(:,:)
1079 #endif
1080  real(dp),allocatable :: cwavef_alltoall2(:,:)
1081  real(dp),allocatable :: cwavef_fft(:,:), cwavef_fft_tr(:,:)
1082  real(dp),allocatable :: weight_t(:),weight1_t(:),weight2_t(:)
1083  real(dp),pointer :: ewavef_alltoall_sym(:,:),wfraug_ptr(:,:,:,:)
1084 
1085 #if defined HAVE_GPU && defined HAVE_YAKL
1086  ! this buffer is necessary to avoid mixing "managed memory" buffer with "regular memory" buffer in MPI calls
1087  ! Just to be clear:
1088  ! - managed memory means memory allocated using ABI_MALLOC_MANAGED
1089  ! - regular memory means memory allocated using either ABI_MALLOC or ABI_MALLOC_CUDA
1090  !
1091  ! here we chose to use a CPU buffer, would it be better to use a GPU buffer ? To be checked.
1092  real(dp), allocatable :: cwavef_mpi(:,:)
1093  !type(c_ptr)            :: cwavef_mpi_c
1094  !real(c_double),pointer :: cwavef_mpi(:,:)
1095 #endif
1096 
1097 ! *************************************************************************
1098 
1099  ABI_CHECK((option_fourwf/=3),'Option=3 (FFT r->g) not implemented')
1100  ABI_CHECK((mpi_enreg%bandpp==ndat),'BUG: bandpp/=ndat')
1101 
1102  spaceComm=mpi_enreg%comm_band
1103  nproc_band = mpi_enreg%nproc_band
1104  nproc_fft  = mpi_enreg%nproc_fft
1105  bandpp     = mpi_enreg%bandpp
1106  me_fft     = mpi_enreg%me_fft
1107 
1108  gpu_option_=ABI_GPU_DISABLED;if (present(gpu_option)) gpu_option_=gpu_option
1109 
1110  if (present(bandfft_kpt_tab)) then
1111    bandfft_kpt_ptr => bandfft_kpt_tab
1112  else
1113    ikpt_this_proc=bandfft_kpt_get_ikpt()
1114    bandfft_kpt_ptr => bandfft_kpt(ikpt_this_proc)
1115  end if
1116 
1117  istwf_k_=istwf_k
1118  flag_inv_sym = (istwf_k_==2 .and. any(ngfft(7) == [401,402,312,512]))
1119  if (option_fourwf==0) flag_inv_sym=((flag_inv_sym).and.(gpu_option_==ABI_GPU_DISABLED))
1120 
1121  if (flag_inv_sym) then
1122    istwf_k_       = 1
1123    if (modulo(bandpp,2)==0) then
1124      bandpp_sym   = bandpp/2
1125    else
1126      bandpp_sym   = bandpp
1127    end if
1128  end if
1129 
1130 !====================================================================================
1131  ABI_MALLOC(sendcountsloc,(nproc_band))
1132  ABI_MALLOC(sdisplsloc   ,(nproc_band))
1133  ABI_MALLOC(recvcountsloc,(nproc_band))
1134  ABI_MALLOC(rdisplsloc   ,(nproc_band))
1135 
1136  recvcounts   =>bandfft_kpt_ptr%recvcounts(:)
1137  sendcounts   =>bandfft_kpt_ptr%sendcounts(:)
1138  rdispls      =>bandfft_kpt_ptr%rdispls   (:)
1139  sdispls      =>bandfft_kpt_ptr%sdispls   (:)
1140  ndatarecv    =>bandfft_kpt_ptr%ndatarecv
1141 
1142  kg_k_gather  =>bandfft_kpt_ptr%kg_k_gather(:,:)
1143  gbound_      =>bandfft_kpt_ptr%gbound(:,:)
1144 
1145  if (flag_inv_sym ) then
1146    idatarecv0           =>bandfft_kpt_ptr%idatarecv0
1147    ndatarecv_tot        =>bandfft_kpt_ptr%ndatarecv_tot
1148    ndatasend_sym        =>bandfft_kpt_ptr%ndatasend_sym
1149    kg_k_gather_sym      =>bandfft_kpt_ptr%kg_k_gather_sym(:,:)
1150    rdispls_sym          =>bandfft_kpt_ptr%rdispls_sym(:)
1151    recvcounts_sym       =>bandfft_kpt_ptr%recvcounts_sym(:)
1152    recvcounts_sym_tot   =>bandfft_kpt_ptr%recvcounts_sym_tot(:)
1153    sdispls_sym          =>bandfft_kpt_ptr%sdispls_sym(:)
1154    sendcounts_sym       =>bandfft_kpt_ptr%sendcounts_sym(:)
1155    sendcounts_sym_all   =>bandfft_kpt_ptr%sendcounts_sym_all(:)
1156    tab_proc             =>bandfft_kpt_ptr%tab_proc(:)
1157  end if
1158 
1159  ABI_MALLOC(cwavef_alltoall2,(2,ndatarecv*bandpp))
1160  if ( ((.not.flag_inv_sym) .and. (bandpp>1) ) .or. flag_inv_sym )then
1161    if(gpu_option_==ABI_GPU_KOKKOS) then
1162 #if defined HAVE_GPU && defined HAVE_YAKL
1163      ABI_MALLOC_MANAGED(cwavef_alltoall1,(/2,ndatarecv*bandpp/))
1164 #endif
1165    else
1166      ABI_MALLOC(cwavef_alltoall1,(2,ndatarecv*bandpp))
1167    end if
1168  end if
1169 
1170  recvcountsloc(:)=recvcounts(:)*2*bandpp
1171  rdisplsloc(:)=rdispls(:)*2*bandpp
1172  sendcountsloc(:)=sendcounts(:)*2
1173  sdisplsloc(:)=sdispls(:)*2
1174 
1175  call timab(547,1,tsec)
1176  if(gpu_option_==ABI_GPU_KOKKOS) then
1177 #if defined HAVE_GPU && defined HAVE_YAKL
1178     ABI_MALLOC(cwavef_mpi,(2,npw_k*blocksize))
1179 
1180     call gpu_data_prefetch_async(C_LOC(cwavef), INT(2, c_size_t)*npw_k*blocksize, CPU_DEVICE_ID)
1181     call gpu_device_synchronize()
1182 
1183     cwavef_mpi(:,:) = cwavef(:,:)
1184 
1185     call xmpi_alltoallv(cwavef_mpi,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
1186          & recvcountsloc,rdisplsloc,spaceComm,ier)
1187     ABI_FREE(cwavef_mpi)
1188 #endif
1189  else
1190    call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
1191         & recvcountsloc,rdisplsloc,spaceComm,ier)
1192  end if
1193  call timab(547,2,tsec)
1194 
1195  tim_fourwf=16
1196 
1197 !Eventually adjust load balancing for FFT (by changing FFT distrib)
1198  have_to_reequilibrate = bandfft_kpt_ptr%have_to_reequilibrate
1199  if(have_to_reequilibrate) then
1200    npw_fft =  bandfft_kpt_ptr%npw_fft
1201    sendcount_fft  => bandfft_kpt_ptr%sendcount_fft(:)
1202    recvcount_fft  => bandfft_kpt_ptr%recvcount_fft(:)
1203    senddisp_fft   => bandfft_kpt_ptr%senddisp_fft(:)
1204    recvdisp_fft   => bandfft_kpt_ptr%recvdisp_fft(:)
1205    indices_pw_fft => bandfft_kpt_ptr%indices_pw_fft(:)
1206    kg_k_fft       => bandfft_kpt_ptr%kg_k_fft(:,:)
1207    ABI_MALLOC( buff_wf, (2,ndatarecv*bandpp) ) ! for sorting cgwavef
1208    ABI_MALLOC( cwavef_fft, (2,npw_fft*bandpp) )
1209    if(bandpp>1) then
1210      ABI_MALLOC( cwavef_fft_tr, (2,npw_fft*bandpp))
1211    end if
1212  end if
1213 
1214  if (option_fourwf==0) wfraug(:,:,:,:)=zero
1215 
1216 !====================================================================
1217  if ((.not.(flag_inv_sym)) .and. (bandpp==1)) then
1218 
1219 !  Compute the index of the band
1220    ind_occ = (iblock-1)*blocksize + mpi_enreg%me_band + 1
1221 
1222    if(abs(occ_k(ind_occ))>=tol8.or.option_fourwf==0) then
1223 
1224 !    Compute the weight of the band
1225      weight=occ_k(ind_occ)*wtk/ucvol
1226 
1227      if(have_to_reequilibrate) then
1228 !      filling of sorted send buffers before exchange
1229        do ipw = 1 ,ndatarecv
1230          buff_wf(1:2, indices_pw_fft(ipw) ) = cwavef_alltoall2(1:2,ipw)
1231        end do
1232        call xmpi_alltoallv(buff_wf,2*sendcount_fft,2*senddisp_fft,  &
1233 &       cwavef_fft,2*recvcount_fft, 2*recvdisp_fft, mpi_enreg%comm_fft,ier)
1234        call fourwf(1,rhoaug,cwavef_fft,dummy,wfraug,gbound_,gbound_,&
1235 &       istwf_k_,kg_k_fft,kg_k_fft,mgfft,mpi_enreg,1,&
1236 &       ngfft,npw_fft,1,n4,n5,n6,option_fourwf,tim_fourwf,weight,weight,&
1237 &       gpu_option=gpu_option_)
1238      else
1239        call fourwf(1,rhoaug,cwavef_alltoall2,dummy,wfraug,gbound_,gbound_,&
1240 &       istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,1,&
1241 &       ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,tim_fourwf,weight,weight,&
1242 &       gpu_option=gpu_option_)
1243      end if
1244      if (option_fourwf==0.and.nproc_fft>1) then
1245        if (me_fft>0) then
1246          nd3=(ngfft(3)-1)/nproc_fft+1
1247          wfraug(:,:,:,me_fft*nd3+1:me_fft*nd3+nd3)=wfraug(:,:,:,1:nd3)
1248          wfraug(:,:,:,1:nd3)=zero
1249        end if
1250        call xmpi_sum(wfraug,mpi_enreg%comm_fft,ier)
1251      end if
1252    end if
1253 
1254 !====================================================================
1255  else if ((.not.(flag_inv_sym)) .and. (bandpp>1) ) then
1256 
1257 !  -------------------------------------------------------------
1258 !  Computation of the index to class the waves functions below bandpp
1259 !  -------------------------------------------------------------
1260    call prep_index_wavef_bandpp(nproc_band,bandpp,&
1261 &   1,ndatarecv,&
1262 &   recvcounts,rdispls,&
1263 &   index_wavef_band)
1264 
1265 !  -------------------------------------------------------
1266 !  Sorting of the wave functions below bandpp
1267 !  -------------------------------------------------------
1268    cwavef_alltoall1(:,:) = cwavef_alltoall2(:,index_wavef_band)
1269 
1270    if(have_to_reequilibrate) then
1271 !    filling of sorted send buffers before exchange
1272      do iibandpp=1,bandpp
1273        do ipw = 1 ,ndatarecv
1274          buff_wf(1:2, iibandpp + bandpp*(indices_pw_fft(ipw)-1)) = &
1275 &         cwavef_alltoall1(1:2,ipw + ndatarecv*(iibandpp-1))
1276        end do
1277      end do
1278      call xmpi_alltoallv(buff_wf,2*bandpp*sendcount_fft,2*bandpp*senddisp_fft,  &
1279 &     cwavef_fft_tr,2*bandpp*recvcount_fft, 2*bandpp*recvdisp_fft, mpi_enreg%comm_fft,ier)
1280      do iibandpp=1,bandpp
1281        do ipw = 1 ,npw_fft
1282          cwavef_fft(1:2,  ipw + npw_fft*(iibandpp -1)) = cwavef_fft_tr(1:2,  iibandpp + bandpp*(ipw-1))
1283        end do
1284      end do
1285    end if
1286 
1287 !  -------------------
1288 !  Fourier calculation
1289 !  -------------------
1290 !  Cuda version
1291    if(gpu_option_/=ABI_GPU_DISABLED) then
1292      ABI_MALLOC(weight_t,(bandpp))
1293      do iibandpp=1,bandpp
1294 !      Compute the index of the band
1295        ind_occ = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + iibandpp
1296 !      Compute the weight of the band
1297        weight_t(iibandpp)=occ_k(ind_occ)*wtk/ucvol
1298        if(abs(occ_k(ind_occ)) < tol8) weight_t(iibandpp) = zero
1299      end do
1300 !    Accumulate time because it is not done in gpu_fourwf
1301      call timab(240+tim_fourwf,1,tsec)
1302      if(gpu_option_==ABI_GPU_LEGACY) then
1303 #if defined HAVE_GPU_CUDA
1304        call gpu_fourwf(1,rhoaug,&
1305 &       cwavef_alltoall1,&
1306 &       dummy,wfraug,gbound_,gbound_,&
1307 &       istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,bandpp,&
1308 &       ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,&
1309 &       tim_fourwf,weight_t,weight_t)
1310 #endif
1311      else if(gpu_option_==ABI_GPU_KOKKOS) then
1312 #if defined HAVE_GPU_CUDA
1313        call gpu_fourwf_managed(1,rhoaug,&
1314 &       cwavef_alltoall1,&
1315 &       dummy,wfraug,gbound_,gbound_,&
1316 &       istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,bandpp,&
1317 &       ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,&
1318 &       tim_fourwf,weight_t,weight_t)
1319 #endif
1320      else if(gpu_option_==ABI_GPU_OPENMP) then
1321 #ifdef HAVE_OPENMP_OFFLOAD
1322        call ompgpu_fourwf    (1,rhoaug,&
1323 &       cwavef_alltoall1,&
1324 &       dummy,wfraug,gbound_,gbound_,&
1325 &       istwf_k_,kg_k_gather,kg_k_gather,mgfft,bandpp,&
1326 &       ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,&
1327 &       weight_t,weight_t)
1328 #endif
1329      end if ! gpu_option_
1330      call timab(240+tim_fourwf,2,tsec)
1331      ABI_FREE(weight_t)
1332 
1333 !  Standard version
1334    else
1335      do iibandpp=1,bandpp
1336 !      Compute the index of the band
1337        ind_occ = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + iibandpp
1338 !      Compute the weight of the band
1339        weight=occ_k(ind_occ)*wtk/ucvol
1340        if (option_fourwf==0) then
1341          wfraug_ptr => wfraug(:,:,:,(iibandpp-1)*n6+1:iibandpp*n6)
1342        else
1343          wfraug_ptr => wfraug
1344        end if
1345        if (abs(occ_k(ind_occ)) >=tol8.or.option_fourwf==0) then
1346          if(have_to_reequilibrate) then
1347            call fourwf(1,rhoaug, &
1348 &           cwavef_fft(:,(npw_fft*(iibandpp-1))+1:(npw_fft*iibandpp)), &
1349 &           dummy,wfraug_ptr,gbound_,gbound_,&
1350 &           istwf_k_,kg_k_fft,kg_k_fft,mgfft,mpi_enreg,1,&
1351 &           ngfft,npw_fft,1,n4,n5,n6,option_fourwf,tim_fourwf,weight,weight,&
1352 &           gpu_option=gpu_option_)
1353          else
1354            call fourwf(1,rhoaug,&
1355 &           cwavef_alltoall1(:,(ndatarecv*(iibandpp-1))+1:(ndatarecv*iibandpp)),&
1356 &           dummy,wfraug_ptr,gbound_,gbound_,&
1357 &           istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,1,&
1358 &           ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,&
1359 &           tim_fourwf,weight,weight)
1360          end if
1361          if (option_fourwf==0.and.nproc_fft>1) then
1362            if (me_fft>0) then
1363              nd3=(ngfft(3)-1)/nproc_fft+1
1364              wfraug_ptr(:,:,:,me_fft*nd3+1:me_fft*nd3+nd3)=wfraug_ptr(:,:,:,1:nd3)
1365              wfraug_ptr(:,:,:,1:nd3)=zero
1366            end if
1367            call xmpi_sum(wfraug_ptr,mpi_enreg%comm_fft,ier)
1368          end if
1369        end if
1370      end do
1371    end if ! (gpu_option/=0)
1372 
1373 !  -----------------------------------------------------
1374 !  Sorting waves functions below the processors
1375 !  -----------------------------------------------------
1376 !  cwavef_alltoall(:,index_wavef_band) = cwavef_alltoall(:,:)   ! NOT NEEDED
1377    ABI_FREE(index_wavef_band)
1378 
1379 !====================================================================
1380  else if (flag_inv_sym) then
1381 
1382 !  -------------------------------------------------------------
1383 !  Computation of the index to class the waves functions below bandpp
1384 !  -------------------------------------------------------------
1385    call prep_index_wavef_bandpp(nproc_band,bandpp,&
1386 &   1,ndatarecv,&
1387 &   recvcounts,rdispls,&
1388 &   index_wavef_band)
1389 
1390 !  -------------------------------------------------------
1391 !  Sorting the wave functions below bandpp
1392 !  -------------------------------------------------------
1393    cwavef_alltoall1(:,:) = cwavef_alltoall2(:,index_wavef_band)
1394 
1395 !  ------------------------------------------------------------
1396 !  We associate the waves functions by two
1397 !  ------------------------------------------------------------
1398    call prep_wavef_sym_do(mpi_enreg,bandpp,1,&
1399 &   ndatarecv,&
1400 &   ndatarecv_tot,ndatasend_sym,tab_proc,&
1401 &   cwavef_alltoall1,&
1402 &   sendcounts_sym,sdispls_sym,&
1403 &   recvcounts_sym,rdispls_sym,&
1404 &   ewavef_alltoall_sym,&
1405 &   index_wavef_send)
1406 
1407 !  ------------------------------------------------------------
1408 !  Fourier calculation
1409 !  ------------------------------------------------------------
1410 !  Cuda version
1411    if (gpu_option_/=ABI_GPU_DISABLED) then
1412      ABI_MALLOC(weight1_t,(bandpp_sym))
1413      ABI_MALLOC(weight2_t,(bandpp_sym))
1414      do iibandpp=1,bandpp_sym
1415        if (bandpp/=1) then
1416          ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp-1)
1417          ind_occ2 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp  )
1418        else
1419          ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + 1
1420          ind_occ2 = ind_occ1
1421        end if
1422        weight1_t(iibandpp) = occ_k(ind_occ1)*wtk/ucvol
1423        weight2_t(iibandpp) = occ_k(ind_occ2)*wtk/ucvol
1424      end do
1425      call timab(240+tim_fourwf,1,tsec)
1426      if (gpu_option_==ABI_GPU_LEGACY) then
1427 #if defined HAVE_GPU_CUDA
1428        call gpu_fourwf(1,rhoaug,&
1429 &       ewavef_alltoall_sym,&
1430 &       dummy,wfraug,gbound_,gbound_,&
1431 &       istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,mpi_enreg,bandpp_sym,&
1432 &       ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,&
1433 &       tim_fourwf,weight1_t,weight2_t)
1434 #endif
1435      else if(gpu_option_==ABI_GPU_KOKKOS) then
1436 #if defined HAVE_GPU_CUDA
1437        call gpu_fourwf_managed(1,rhoaug,&
1438 &       ewavef_alltoall_sym,&
1439 &       dummy,wfraug,gbound_,gbound_,&
1440 &       istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,mpi_enreg,bandpp_sym,&
1441 &       ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,&
1442 &       tim_fourwf,weight1_t,weight2_t)
1443 #endif
1444      else if (gpu_option_==ABI_GPU_OPENMP) then
1445 #ifdef HAVE_OPENMP_OFFLOAD
1446        call ompgpu_fourwf(1,rhoaug,&
1447 &       ewavef_alltoall_sym,&
1448 &       dummy,wfraug,gbound_,gbound_,&
1449 &       istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,bandpp_sym,&
1450 &       ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,&
1451 &       weight1_t,weight2_t)
1452 #endif
1453      end if ! gpu_option_
1454      call timab(240+tim_fourwf,2,tsec)
1455      ABI_FREE(weight1_t)
1456      ABI_FREE(weight2_t)
1457 
1458 !  Standard version
1459    else
1460      if (option_fourwf==0.and.bandpp>1) then
1461        ABI_MALLOC(wfraug_ptr,(2,n4,n5,n6))
1462      else
1463        wfraug_ptr => wfraug
1464      end if
1465      do iibandpp=1,bandpp_sym
1466        if (bandpp/=1) then
1467          ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp-1)
1468          ind_occ2 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp  )
1469        else
1470          ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + 1
1471          ind_occ2 = ind_occ1
1472        end if
1473        weight1 = occ_k(ind_occ1)*wtk/ucvol
1474        weight2 = occ_k(ind_occ2)*wtk/ucvol
1475        call fourwf(1,rhoaug,&
1476 &       ewavef_alltoall_sym(:,(ndatarecv_tot*(iibandpp-1))+1:(ndatarecv_tot*iibandpp)),&
1477 &       dummy,wfraug_ptr,gbound_,gbound_,&
1478 &       istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,mpi_enreg,1,&
1479 &       ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,&
1480 &       tim_fourwf,weight1,weight2)
1481        if (option_fourwf==0) then
1482          if (modulo(bandpp,2)==0) then
1483            jjbandpp=2*iibandpp-1
1484            wfraug(1,:,:,(jjbandpp-1)*n6+1:jjbandpp*n6)=wfraug_ptr(1,:,:,1:n6)
1485            wfraug(1,:,:,(jjbandpp)*n6+1:(jjbandpp+1)*n6)=wfraug_ptr(2,:,:,1:n6)
1486          else if (bandpp>1) then
1487            wfraug(1,:,:,(iibandpp-1)*n6+1:iibandpp*n6)=wfraug_ptr(1,:,:,1:n6)
1488          end if
1489          if (nproc_fft>1) then
1490            if (me_fft>0) then
1491              nd3=(ngfft(3)-1)/nproc_fft+1
1492              wfraug(1,:,:,me_fft*nd3+1:me_fft*nd3+nd3)=wfraug(1,:,:,1:nd3)
1493              wfraug(1,:,:,1:nd3)=zero
1494            end if
1495            call xmpi_sum(wfraug,mpi_enreg%comm_fft,ier)
1496          end if
1497        end if
1498      end do
1499      if (option_fourwf==0.and.bandpp>1) then
1500        ABI_FREE(wfraug_ptr)
1501      end if
1502    end if ! (gpu_option/=ABI_GPU_DISABLED)
1503 
1504 !  ------------------------------------------------------------
1505 !  We dissociate each wave function in two waves functions
1506 !  gwavef is classed below of bandpp
1507 !  ------------------------------------------------------------
1508    call prep_wavef_sym_undo(mpi_enreg,bandpp,1,&
1509 &   ndatarecv,&
1510 &   ndatarecv_tot,ndatasend_sym,idatarecv0,&
1511 &   cwavef_alltoall1,&
1512 &   sendcounts_sym,sdispls_sym,&
1513 &   recvcounts_sym,rdispls_sym,&
1514 &   ewavef_alltoall_sym,&
1515 &   index_wavef_send)
1516 
1517    ABI_FREE(ewavef_alltoall_sym)
1518    ABI_FREE(index_wavef_send)
1519 
1520 !  -------------------------------------------------------
1521 !  Sorting waves functions below the processors
1522 !  -------------------------------------------------------
1523 !  cwavef_alltoall(:,index_wavef_band) = cwavef_alltoall(:,:) ! NOT NEEDED
1524 
1525    ABI_FREE(index_wavef_band)
1526 
1527  end if
1528 
1529 !====================================================================
1530  if(have_to_reequilibrate) then
1531    ABI_FREE(buff_wf)
1532    ABI_FREE(cwavef_fft)
1533    if(bandpp > 1) then
1534      ABI_FREE(cwavef_fft_tr)
1535    end if
1536  end if
1537  ABI_FREE(sendcountsloc)
1538  ABI_FREE(sdisplsloc)
1539  ABI_FREE(recvcountsloc)
1540  ABI_FREE(rdisplsloc)
1541  ABI_FREE(cwavef_alltoall2)
1542  if ( ((.not.flag_inv_sym) .and. (bandpp>1) ) .or. flag_inv_sym ) then
1543    if(gpu_option_==ABI_GPU_KOKKOS) then
1544 #if defined HAVE_GPU && defined HAVE_YAKL
1545      ABI_FREE_MANAGED(cwavef_alltoall1)
1546 #endif
1547    else
1548      ABI_FREE(cwavef_alltoall1)
1549    end if
1550  end if
1551 
1552 end subroutine prep_fourwf

ABINIT/prep_getghc [ Functions ]

[ Top ] [ Functions ]

NAME

 prep_getghc

FUNCTION

 this routine prepares the data to the call of getghc.

INPUTS

  blocksize= size of block for FFT
  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
  cwavef(2,npw*my_nspinor*blocksize)=planewave coefficients of wavefunction.
  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
  gvnlxc=matrix elements <G|Vnonlocal+VFockACE|C>
  lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1
         Typically lambda is the eigenvalue (or its guess)
  mpi_enreg=information about mpi parallelization
  prtvol=control print volume and debugging output
  sij_opt= -PAW ONLY-  if  0, only matrix elements <G|H|C> have to be computed
     (S=overlap)       if  1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc
                       if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used)

OUTPUT

  gwavef=(2,npw*my_nspinor*blocksize)=matrix elements <G|H|C> (if sij_opt>=0)
                                  or <G|H-lambda.S|C> (if sij_opt=-1).
  swavef=(2,npw*my_nspinor*blocksize)=matrix elements <G|S|C>.

SIDE EFFECTS

  ====== if gs_hamk%usepaw==1
  cwaveprj(natom,my_nspinor*bandpp)= wave functions at k projected with nl projectors

SOURCE

105 subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksize, &
106                        mpi_enreg, prtvol, sij_opt, cpopt, cwaveprj, &
107                        already_transposed) ! optional argument
108 
109 !Arguments ------------------------------------
110 !scalars
111  integer,intent(in) :: blocksize,cpopt,prtvol,sij_opt
112  logical, intent(in),optional :: already_transposed
113  real(dp),intent(in) :: lambda
114  type(gs_hamiltonian_type),intent(inout) :: gs_hamk
115  type(mpi_type),intent(in) :: mpi_enreg
116 !arrays
117  real(dp),intent(in) :: cwavef(:,:)
118  real(dp),intent(inout) :: gvnlxc (:,:),gwavef(:,:),swavef(:,:)
119  type(pawcprj_type), intent(inout) :: cwaveprj(:,:)
120 
121 !Local variables-------------------------------
122 !scalars
123  integer,parameter :: tim_getghc=6
124  integer :: bandpp,bandpp_sym,idatarecv0,ier,ikpt_this_proc,iscalc,mcg,my_nspinor
125  integer :: nbval,ndatarecv,ndatarecv_tot,ndatasend_sym,nproc_band,nproc_fft
126  integer :: spaceComm
127  logical :: flag_inv_sym, do_transpose, local_gvnlxc
128  !character(len=500) :: msg
129 !arrays
130  integer,allocatable :: index_wavef_band(:),index_wavef_send(:),index_wavef_spband(:)
131  integer,allocatable :: rdisplsloc(:),recvcountsloc(:),sdisplsloc(:),sendcountsloc(:)
132  integer,ABI_CONTIGUOUS pointer :: kg_k_gather_sym(:,:)
133  integer,ABI_CONTIGUOUS pointer :: rdispls(:),rdispls_sym(:)
134  integer,ABI_CONTIGUOUS pointer :: recvcounts(:),recvcounts_sym(:),recvcounts_sym_tot(:)
135  integer,ABI_CONTIGUOUS pointer :: sdispls(:),sdispls_sym(:)
136  integer,ABI_CONTIGUOUS pointer :: sendcounts(:),sendcounts_sym(:),sendcounts_sym_all(:)
137  integer,ABI_CONTIGUOUS pointer :: tab_proc(:)
138  real(dp) :: tsec(2)
139  real(dp),allocatable,target :: cwavef_alltoall1(:,:), gvnlxc_alltoall1(:,:)
140  real(dp),allocatable,target :: gwavef_alltoall1(:,:), swavef_alltoall1(:,:)
141 
142 #if defined HAVE_GPU && defined HAVE_YAKL
143  real(c_double), ABI_CONTIGUOUS pointer :: cwavef_alltoall2(:,:) => null()
144  real(c_double), ABI_CONTIGUOUS pointer :: gvnlxc_alltoall2(:,:) => null()
145  real(c_double), ABI_CONTIGUOUS pointer :: gwavef_alltoall2(:,:) => null()
146  real(c_double), ABI_CONTIGUOUS pointer :: swavef_alltoall2(:,:) => null()
147 #else
148  real(dp),allocatable,target :: cwavef_alltoall2(:,:)
149  real(dp),allocatable,target :: gvnlxc_alltoall2(:,:)
150  real(dp),allocatable,target :: gwavef_alltoall2(:,:)
151  real(dp),allocatable,target :: swavef_alltoall2(:,:)
152 #endif
153 
154  real(dp),pointer :: ewavef_alltoall_sym(:,:)
155  real(dp),pointer :: gvnlxc_alltoall_sym(:,:)
156  real(dp),pointer :: gwavef_alltoall_sym(:,:)
157  real(dp),pointer :: swavef_alltoall_sym(:,:)
158 
159 ! *************************************************************************
160 
161  call timab(630,1,tsec)
162  call timab(631,3,tsec)
163 
164 !Some inits
165  nproc_band = mpi_enreg%nproc_band
166  nproc_fft  = mpi_enreg%nproc_fft
167  bandpp     = mpi_enreg%bandpp
168  my_nspinor = max(1,gs_hamk%nspinor/mpi_enreg%nproc_spinor)
169 
170  do_transpose = .true.
171  if(present(already_transposed)) then
172    if(already_transposed) do_transpose = .false.
173  end if
174 
175  flag_inv_sym = (gs_hamk%istwf_k==2 .and. any(gs_hamk%ngfft(7) == [401,402,312,512]))
176  if (flag_inv_sym) then
177    gs_hamk%istwf_k = 1
178    if (modulo(bandpp,2)==0) bandpp_sym = bandpp/2
179    if (modulo(bandpp,2)/=0) bandpp_sym = bandpp
180  end if
181 
182 !Check sizes
183  mcg=2*gs_hamk%npw_fft_k*my_nspinor*bandpp
184  if (do_transpose) mcg=2*gs_hamk%npw_k*my_nspinor*blocksize
185  if (size(cwavef)<mcg) then
186    ABI_BUG('wrong size for cwavef!')
187  end if
188  if (size(gwavef)<mcg) then
189    ABI_BUG('wrong size for gwavef!')
190  end if
191  local_gvnlxc = .false.
192  if (size(gvnlxc)==0) then
193    local_gvnlxc = .true.
194  end if
195  if ((.not.local_gvnlxc).and.size(gvnlxc)<mcg) then
196    ABI_BUG('wrong size for gvnlxc!')
197  end if
198  if (sij_opt==1) then
199    if (size(swavef)<mcg) then
200      ABI_BUG('wrong size for swavef!')
201    end if
202  end if
203  if (gs_hamk%usepaw==1.and.cpopt>=0) then
204    if (size(cwaveprj)<gs_hamk%natom*my_nspinor*bandpp) then
205      ABI_BUG('wrong size for cwaveprj!')
206    end if
207  end if
208 
209 !====================================================================================
210 
211  spaceComm=mpi_enreg%comm_fft
212  if(mpi_enreg%paral_kgb==1) spaceComm=mpi_enreg%comm_band
213 
214  ikpt_this_proc=bandfft_kpt_get_ikpt()
215 
216  ABI_MALLOC(sendcountsloc,(nproc_band))
217  ABI_MALLOC(sdisplsloc   ,(nproc_band))
218  ABI_MALLOC(recvcountsloc,(nproc_band))
219  ABI_MALLOC(rdisplsloc   ,(nproc_band))
220 
221  recvcounts   =>bandfft_kpt(ikpt_this_proc)%recvcounts(:)
222  sendcounts   =>bandfft_kpt(ikpt_this_proc)%sendcounts(:)
223  rdispls      =>bandfft_kpt(ikpt_this_proc)%rdispls   (:)
224  sdispls      =>bandfft_kpt(ikpt_this_proc)%sdispls   (:)
225  ndatarecv    = bandfft_kpt(ikpt_this_proc)%ndatarecv
226 
227  if (flag_inv_sym ) then
228    idatarecv0           = bandfft_kpt(ikpt_this_proc)%idatarecv0
229    ndatarecv_tot        = bandfft_kpt(ikpt_this_proc)%ndatarecv_tot
230    ndatasend_sym        = bandfft_kpt(ikpt_this_proc)%ndatasend_sym
231    kg_k_gather_sym      =>bandfft_kpt(ikpt_this_proc)%kg_k_gather_sym(:,:)
232    rdispls_sym          =>bandfft_kpt(ikpt_this_proc)%rdispls_sym(:)
233    recvcounts_sym       =>bandfft_kpt(ikpt_this_proc)%recvcounts_sym(:)
234    recvcounts_sym_tot   =>bandfft_kpt(ikpt_this_proc)%recvcounts_sym_tot(:)
235    sdispls_sym          =>bandfft_kpt(ikpt_this_proc)%sdispls_sym(:)
236    sendcounts_sym       =>bandfft_kpt(ikpt_this_proc)%sendcounts_sym(:)
237    sendcounts_sym_all   =>bandfft_kpt(ikpt_this_proc)%sendcounts_sym_all(:)
238    tab_proc             =>bandfft_kpt(ikpt_this_proc)%tab_proc(:)
239  end if
240  iscalc=(sij_opt+1)/2  ! 0 if S not calculated, 1 otherwise
241  nbval=(ndatarecv*my_nspinor*bandpp)*iscalc
242 
243  if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
244 & ((.not.flag_inv_sym) .and. bandpp>1) .or.  flag_inv_sym  ) then
245    ABI_MALLOC(cwavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
246    ABI_MALLOC(gwavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
247    ABI_MALLOC(swavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
248    if (local_gvnlxc) then
249      ABI_MALLOC(gvnlxc_alltoall1,(0,0))
250    else
251      ABI_MALLOC(gvnlxc_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
252    end if
253    swavef_alltoall1(:,:)=zero
254    cwavef_alltoall1(:,:)=zero
255    gwavef_alltoall1(:,:)=zero
256    if (.not.local_gvnlxc) gvnlxc_alltoall1(:,:)=zero
257  end if
258 
259  if(gs_hamk%gpu_option==ABI_GPU_KOKKOS) then
260 #if defined HAVE_GPU && defined HAVE_YAKL
261    ABI_MALLOC_MANAGED(cwavef_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/))
262    ABI_MALLOC_MANAGED(gwavef_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/))
263    ABI_MALLOC_MANAGED(swavef_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/))
264    if (local_gvnlxc) then
265      ABI_MALLOC_MANAGED(gvnlxc_alltoall2,(/0,0/))
266    else
267      ABI_MALLOC_MANAGED(gvnlxc_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/))
268    end if
269 #endif
270  else
271    ABI_MALLOC(cwavef_alltoall2,(2,ndatarecv*my_nspinor*bandpp))
272    ABI_MALLOC(gwavef_alltoall2,(2,ndatarecv*my_nspinor*bandpp))
273    ABI_MALLOC(swavef_alltoall2,(2,ndatarecv*my_nspinor*bandpp))
274    if (local_gvnlxc) then
275      ABI_MALLOC(gvnlxc_alltoall2,(0,0))
276    else
277      ABI_MALLOC(gvnlxc_alltoall2,(2,ndatarecv*my_nspinor*bandpp))
278    end if
279  end if
280 
281  swavef_alltoall2(:,:)=zero
282  cwavef_alltoall2(:,:)=zero
283  gwavef_alltoall2(:,:)=zero
284  if (.not.local_gvnlxc) gvnlxc_alltoall2(:,:)=zero
285 
286  recvcountsloc(:)=recvcounts(:)*2*my_nspinor*bandpp
287  rdisplsloc(:)=rdispls(:)*2*my_nspinor*bandpp
288  sendcountsloc(:)=sendcounts(:)*2*my_nspinor
289  sdisplsloc(:)=sdispls(:)*2*my_nspinor
290  call timab(631,2,tsec)
291 
292  if(do_transpose) then
293    call timab(545,3,tsec)
294    if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
295 &   ((.not.flag_inv_sym) .and. bandpp>1) .or.  flag_inv_sym  ) then
296      call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall1,&
297 &     recvcountsloc,rdisplsloc,spaceComm,ier)
298    else
299      call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
300 &     recvcountsloc,rdisplsloc,spaceComm,ier)
301    end if
302    call timab(545,2,tsec)
303  else
304    ! Here, we cheat, and use DCOPY to bypass some compiler's overzealous bound-checking
305    ! (ndatarecv*my_nspinor*bandpp might be greater than the declared size of cwavef)
306    call DCOPY(2*ndatarecv*my_nspinor*bandpp, cwavef, 1, cwavef_alltoall2, 1)
307  end if
308 
309 !====================================================================
310  if ((.not.(flag_inv_sym)) .and. (bandpp==1)) then
311    if (do_transpose .and. mpi_enreg%paral_spinor==0.and.my_nspinor==2)then
312      call timab(632,3,tsec)
313 !    Sort to have all ispinor=1 first, then all ispinor=2
314      call prep_sort_wavef_spin(nproc_band,my_nspinor,ndatarecv,recvcounts,rdispls,index_wavef_spband)
315      cwavef_alltoall2(:,:)=cwavef_alltoall1(:,index_wavef_spband)
316      call timab(632,2,tsec)
317    end if
318 
319    call timab(635,3,tsec)
320    call multithreaded_getghc(cpopt,cwavef_alltoall2,cwaveprj,gwavef_alltoall2,swavef_alltoall2(:,1:nbval),&
321 &   gs_hamk,gvnlxc_alltoall2,lambda,mpi_enreg,1,prtvol,sij_opt,tim_getghc,0)
322    call timab(635,2,tsec)
323 
324    if (do_transpose .and. mpi_enreg%paral_spinor==0.and.my_nspinor==2)then
325      call timab(634,3,tsec)
326      gwavef_alltoall1(:,index_wavef_spband)=gwavef_alltoall2(:,:)
327      if (sij_opt==1) swavef_alltoall1(:,index_wavef_spband)=swavef_alltoall2(:,:)
328      if (.not.local_gvnlxc) gvnlxc_alltoall1(:,index_wavef_spband)=gvnlxc_alltoall2(:,:)
329      ABI_FREE(index_wavef_spband)
330      call timab(634,2,tsec)
331    end if
332 
333  else if ((.not.(flag_inv_sym)) .and. (bandpp>1)) then
334 !  -------------------------------------------------------------
335 !  Computation of the index to class the waves functions below bandpp
336 !  -------------------------------------------------------------
337 
338    if(do_transpose) then
339      call timab(632,3,tsec)
340      call prep_index_wavef_bandpp(nproc_band,bandpp,&
341 &     my_nspinor,ndatarecv, recvcounts,rdispls, index_wavef_band)
342 !  -------------------------------------------------------
343 !  Sorting of the waves functions below bandpp
344 !  -------------------------------------------------------
345      cwavef_alltoall2(:,:) = cwavef_alltoall1(:,index_wavef_band)
346      call timab(632,2,tsec)
347    end if
348 
349 !  ----------------------
350 !  Fourier transformation
351 !  ----------------------
352    call timab(636,3,tsec)
353    call multithreaded_getghc(cpopt,cwavef_alltoall2,cwaveprj,gwavef_alltoall2,swavef_alltoall2,gs_hamk,&
354 &   gvnlxc_alltoall2,lambda,mpi_enreg,bandpp,prtvol,sij_opt,tim_getghc,0)
355    call timab(636,2,tsec)
356 
357 !  -----------------------------------------------------
358 !  Sorting of waves functions below the processors
359 !  -----------------------------------------------------
360    if(do_transpose) then
361      call timab(634,3,tsec)
362      gwavef_alltoall1(:,index_wavef_band) = gwavef_alltoall2(:,:)
363      if (sij_opt==1) swavef_alltoall1(:,index_wavef_band) = swavef_alltoall2(:,:)
364      if (.not.local_gvnlxc) gvnlxc_alltoall1(:,index_wavef_band)  = gvnlxc_alltoall2(:,:)
365      ABI_FREE(index_wavef_band)
366      call timab(634,2,tsec)
367    end if
368 
369 
370  else if (flag_inv_sym) then
371 
372 !  -------------------------------------------------------------
373 !  Computation of the index to class the waves functions below bandpp
374 !  -------------------------------------------------------------
375    if(do_transpose) then
376      call timab(632,3,tsec)
377      call prep_index_wavef_bandpp(nproc_band,bandpp,&
378 &     my_nspinor,ndatarecv,&
379 &     recvcounts,rdispls,&
380 &     index_wavef_band)
381 
382 !  -------------------------------------------------------
383 !  Sorting the wave functions below bandpp
384 !  -------------------------------------------------------
385      cwavef_alltoall2(:,:) = cwavef_alltoall1(:,index_wavef_band)
386    end if
387 
388 !  ------------------------------------------------------------
389 !  We associate the waves functions by two
390 !  ------------------------------------------------------------
391    call prep_wavef_sym_do(mpi_enreg,bandpp,my_nspinor,&
392 &   ndatarecv,&
393 &   ndatarecv_tot,ndatasend_sym,tab_proc,&
394 &   cwavef_alltoall2,&
395 &   sendcounts_sym,sdispls_sym,&
396 &   recvcounts_sym,rdispls_sym,&
397 &   ewavef_alltoall_sym,&
398 &   index_wavef_send)
399 
400 !  ------------------------------------------------------------
401 !  Allocation
402 !  ------------------------------------------------------------
403    ABI_MALLOC(gwavef_alltoall_sym,(2,ndatarecv_tot*bandpp_sym))
404    ABI_MALLOC(swavef_alltoall_sym,(2,(ndatarecv_tot*bandpp_sym)*iscalc))
405    if (local_gvnlxc) then
406      ABI_MALLOC(gvnlxc_alltoall_sym ,(0,0))
407    else
408      ABI_MALLOC(gvnlxc_alltoall_sym ,(2,ndatarecv_tot*bandpp_sym))
409    end if
410    gwavef_alltoall_sym(:,:)=zero
411    swavef_alltoall_sym(:,:)=zero
412    if (.not.local_gvnlxc) gvnlxc_alltoall_sym(:,:)=zero
413 
414    call timab(632,2,tsec)
415 
416 !  ------------------------------------------------------------
417 !  Fourier calculation
418 !  ------------------------------------------------------------
419    call timab(637,3,tsec)
420    call multithreaded_getghc(cpopt,ewavef_alltoall_sym,cwaveprj,gwavef_alltoall_sym,swavef_alltoall_sym,gs_hamk,&
421 &   gvnlxc_alltoall_sym,lambda,mpi_enreg,bandpp_sym,prtvol,sij_opt,tim_getghc,1,&
422 &   kg_fft_k=kg_k_gather_sym)
423    call timab(637,2,tsec)
424 
425    call timab(633,3,tsec)
426 
427 !  ------------------------------------------------------------
428 !  We dissociate each wave function in two waves functions
429 !  gwavef is classed below of bandpp
430 !  ------------------------------------------------------------
431    call prep_wavef_sym_undo(mpi_enreg,bandpp,my_nspinor,&
432 &   ndatarecv,&
433 &   ndatarecv_tot,ndatasend_sym,idatarecv0,&
434 &   gwavef_alltoall2,&
435 &   sendcounts_sym,sdispls_sym,&
436 &   recvcounts_sym,rdispls_sym,&
437 &   gwavef_alltoall_sym,&
438 &   index_wavef_send)
439    if (sij_opt==1)then
440      call prep_wavef_sym_undo(mpi_enreg,bandpp,my_nspinor,&
441 &     ndatarecv,&
442 &     ndatarecv_tot,ndatasend_sym,idatarecv0,&
443 &     swavef_alltoall2,&
444 &     sendcounts_sym,sdispls_sym,&
445 &     recvcounts_sym,rdispls_sym,&
446 &     swavef_alltoall_sym,&
447 &     index_wavef_send)
448    end if
449    if (.not.local_gvnlxc) call prep_wavef_sym_undo(mpi_enreg,bandpp,my_nspinor,&
450 &   ndatarecv,&
451 &   ndatarecv_tot,ndatasend_sym,idatarecv0,&
452 &   gvnlxc_alltoall2,&
453 &   sendcounts_sym,sdispls_sym,&
454 &   recvcounts_sym,rdispls_sym,&
455 &   gvnlxc_alltoall_sym,&
456 &   index_wavef_send)
457 
458    ABI_FREE(ewavef_alltoall_sym)
459    ABI_FREE(index_wavef_send)
460    ABI_FREE(gwavef_alltoall_sym)
461    ABI_FREE(swavef_alltoall_sym)
462    ABI_FREE(gvnlxc_alltoall_sym)
463 
464 !  -------------------------------------------
465 !  We call getghc to calculate the nl matrix elements.
466 !  --------------------------------------------
467    gs_hamk%istwf_k=2
468    !!write(std_out,*)"Setting iswfk_k to 2"
469 
470    call timab(633,2,tsec)
471 
472    call timab(638,3,tsec)
473    call multithreaded_getghc(cpopt,cwavef_alltoall2,cwaveprj,gwavef_alltoall2,swavef_alltoall2,gs_hamk,&
474 &   gvnlxc_alltoall2,lambda,mpi_enreg,bandpp,prtvol,sij_opt,tim_getghc,2)
475    call timab(638,2,tsec)
476 
477    call timab(634,3,tsec)
478 
479    gs_hamk%istwf_k=1
480 
481 !  -------------------------------------------------------
482 !  Sorting the wave functions below the processors
483 !  -------------------------------------------------------
484    if(do_transpose) then
485 !    cwavef_alltoall(:,index_wavef_band) = cwavef_alltoall(:,:)   ! NOT NEEDED
486      gwavef_alltoall1(:,index_wavef_band) = gwavef_alltoall2(:,:)
487      if (sij_opt==1) swavef_alltoall1(:,index_wavef_band) = swavef_alltoall2(:,:)
488      if (.not.local_gvnlxc) gvnlxc_alltoall1(:,index_wavef_band)  = gvnlxc_alltoall2(:,:)
489      ABI_FREE(index_wavef_band)
490      call timab(634,2,tsec)
491    end if
492 
493  end if
494 !====================================================================
495 
496  if(do_transpose) then
497 
498    call timab(545,3,tsec)
499    if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
500 &   ((.not.flag_inv_sym) .and. bandpp>1) .or.  flag_inv_sym  ) then
501      if (sij_opt==1) then
502        call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,&
503 &       sendcountsloc,sdisplsloc,spaceComm,ier)
504      end if
505      if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,&
506 &     sendcountsloc,sdisplsloc,spaceComm,ier)
507      call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,&
508 &     sendcountsloc,sdisplsloc,spaceComm,ier)
509    else
510      if (sij_opt==1) then
511        call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,&
512 &       sendcountsloc,sdisplsloc,spaceComm,ier)
513      end if
514      if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,&
515 &     sendcountsloc,sdisplsloc,spaceComm,ier)
516      call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,&
517 &     sendcountsloc,sdisplsloc,spaceComm,ier)
518    end if
519 
520    call timab(545,2,tsec)
521  else
522    if(sij_opt == 1) then
523      call DCOPY(2*ndatarecv*my_nspinor*bandpp, swavef_alltoall2, 1, swavef, 1)
524    end if
525    if (.not.local_gvnlxc) call DCOPY(2*ndatarecv*my_nspinor*bandpp, gvnlxc_alltoall2, 1, gvnlxc, 1)
526    call DCOPY(2*ndatarecv*my_nspinor*bandpp, gwavef_alltoall2, 1, gwavef, 1)
527  end if
528 
529 !====================================================================
530  if (flag_inv_sym) then
531    gs_hamk%istwf_k = 2
532  end if
533 !====================================================================
534  ABI_FREE(sendcountsloc)
535  ABI_FREE(sdisplsloc)
536  ABI_FREE(recvcountsloc)
537  ABI_FREE(rdisplsloc)
538 
539  if(gs_hamk%gpu_option==ABI_GPU_KOKKOS) then
540 #if defined HAVE_GPU && defined HAVE_YAKL
541    ABI_FREE_MANAGED(cwavef_alltoall2)
542    ABI_FREE_MANAGED(gwavef_alltoall2)
543    ABI_FREE_MANAGED(gvnlxc_alltoall2)
544    ABI_FREE_MANAGED(swavef_alltoall2)
545 #endif
546  else
547    ABI_FREE(cwavef_alltoall2)
548    ABI_FREE(gwavef_alltoall2)
549    ABI_FREE(gvnlxc_alltoall2)
550    ABI_FREE(swavef_alltoall2)
551  end if
552 
553  if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. &
554 & ((.not.flag_inv_sym) .and. bandpp>1) .or.  flag_inv_sym  ) then
555    ABI_FREE(cwavef_alltoall1)
556    ABI_FREE(gwavef_alltoall1)
557    ABI_FREE(gvnlxc_alltoall1)
558    ABI_FREE(swavef_alltoall1)
559  end if
560 
561  call timab(630,2,tsec)
562 
563 end subroutine prep_getghc

ABINIT/prep_index_wavef_bandpp [ Functions ]

[ Top ] [ Functions ]

NAME

 prep_index_wavef_bandpp

FUNCTION

 this routine sorts the waves functions by bandpp and by processors
 after the alltoall

INPUTS

  nproc_band = number of processors below the band
  bandpp     = number of groups of couple of waves functions
  nspinor    = number of spin
  ndatarecv  = total number of values received by the processor and sended
               by the other processors band
  recvcounts = number of values sended by each processor band and received
               by the processor
  rdispls    = positions of the values received by the processor and
               sended by each processor band

OUTPUT

  index_wavef_band = position of the sorted values

SOURCE

2078 subroutine prep_index_wavef_bandpp(nproc_band,bandpp,&
2079                              nspinor,ndatarecv,&
2080                              recvcounts,rdispls,&
2081                              index_wavef_band)
2082 
2083 !Arguments ------------------------------------
2084 !scalars
2085  integer,intent(in) :: bandpp,ndatarecv,nproc_band,nspinor
2086 !arrays
2087  integer,intent(in) :: rdispls(nproc_band),recvcounts(nproc_band)
2088  integer,allocatable,intent(out) :: index_wavef_band(:)
2089 
2090 !Local variables-------------------------------
2091 !scalars
2092  integer :: delta,idebc,idebe,ifinc,ifine,iindex,iproc,kbandpp,nb
2093 
2094 ! *********************************************************************
2095 
2096 !DEBUG
2097 !write(std_out,*)' prep_index_wavef_banpp : enter '
2098 !write(std_out,*) 'ndatarecv = ', ndatarecv
2099 !write(std_out,*) 'rdispls(:) = ', rdispls(:)
2100 !write(std_out,*) 'recvcounts(:) = ', recvcounts(:)
2101 !ENDDEBUG
2102 
2103 
2104 !---------------------------------------------
2105 !Allocation
2106 !---------------------------------------------
2107  ABI_MALLOC(index_wavef_band ,(bandpp*nspinor*ndatarecv))
2108  index_wavef_band(:)   =0
2109 
2110 !---------------------------------------------
2111 !Calcul : loops on bandpp and processors band
2112 !---------------------------------------------
2113  nb = sum(recvcounts(1:nproc_band))
2114  do kbandpp=1,bandpp
2115 
2116    do iproc=1,nproc_band
2117 
2118      idebe = (rdispls(iproc) + 1)  + (kbandpp-1) * ndatarecv*nspinor
2119      ifine = idebe + recvcounts(iproc) -1
2120 
2121      if (iproc==1) then
2122        idebc =   (kbandpp-1)* recvcounts(iproc)*nspinor + 1
2123      else
2124        idebc = (bandpp)  * sum(recvcounts(1:iproc-1))*nspinor &
2125        + (kbandpp-1)* recvcounts(iproc)*nspinor &
2126        + 1
2127      end if
2128      ifinc = idebc + recvcounts(iproc) -1
2129      index_wavef_band(idebe:ifine) = (/( iindex,iindex=idebc,ifinc)/)
2130      delta=ifine-idebe
2131      if (nspinor==2) then
2132        index_wavef_band(idebe+nb :idebe+nb +delta)=(/( iindex,iindex=ifinc+1,ifinc+1+delta)/)
2133      end if
2134    end do
2135  end do
2136 
2137 end subroutine prep_index_wavef_bandpp

abinit/prep_nonlop [ Functions ]

[ Top ] [ abinit ] [ Functions ]

NAME

 prep_nonlop

FUNCTION

 this routine prepares the data to the call of nonlop.

INPUTS

  choice: chooses possible output:
    choice=1 => a non-local energy contribution
          =2 => a gradient with respect to atomic position(s)
          =3 => a gradient with respect to strain(s)
          =23=> a gradient with respect to atm. pos. and strain(s)
          =4 => a 2nd derivative with respect to atomic pos.
          =24=> a gradient and 2nd derivative with respect to atomic pos.
          =5 => a gradient with respect to k wavevector
          =6 => 2nd derivatives with respect to strain and atm. pos.
          =7 => no operator, just projections
  blocksize= size of block for FFT
  cpopt=flag defining the status of cwaveprj=<Proj_i|Cnk> scalars (see below, side effects)
  cwavef(2,npw*my_nspinor*blocksize)=planewave coefficients of wavefunction.
  gvnlxc=matrix elements <G|Vnonlocal+VFockACE|C>
  hamk <type(gs_hamiltonian_type)>=data defining the Hamiltonian at a given k (NL part involved here)
  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
                        - k point direction in the case (choice=5,signs=2)
                        - strain component (1:6) in the case (choice=2,signs=2) or (choice=6,signs=1)
  lambdablock(blocksize)=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
  mpi_enreg=information about mpi parallelization
  nnlout=dimension of enlout (when signs=1):
  ntypat=number of types of atoms in cell
  paw_opt= define the nonlocal operator concerned with
  signs= if 1, get contracted elements (energy, forces, stress, ...)
         if 2, applies the non-local operator to a function in reciprocal space
  tim_nonlop=timing code of the calling routine (can be set to 0 if not attributed)
  vectproj(2,nprojs,my_nspinor*ndat)=Optional, vector to be used instead of cprjin%cp when provided

OUTPUT

  ==== if (signs==1) ====
  enlout_block(nnlout)=
    if paw_opt==0, 1 or 2: contribution of this block of states to the nl part of various properties
    if paw_opt==3:         contribution of this block of states to <c|S|c>  (where S=overlap when PAW)
  ==== if (signs==2) ====
    if paw_opt==0, 1, 2 or 4:
       gvnlc(2,my_nspinor*npw)=result of the application of the nl operator
                        or one of its derivative to the input vect.
    if paw_opt==3 or 4:
       gsc(2,my_nspinor*npw*(paw_opt/3))=result of the aplication of (I+S)
                        to the input vect. (where S=overlap when PAW)

SIDE EFFECTS

  ==== ONLY IF useylm=1
  cwaveprj(natom,my_nspinor) <type(pawcprj_type)>=projected input wave function |c> on non-local projector
                                  =<p_lmn|c> and derivatives
                                  Treatment depends on cpopt parameter:
                     if cpopt=-1, <p_lmn|in> (and derivatives)
                                  have to be computed (and not saved)
                     if cpopt= 0, <p_lmn|in> have to be computed and saved
                                  derivatives are eventually computed but not saved
                     if cpopt= 1, <p_lmn|in> and first derivatives have to be computed and saved
                                  other derivatives are eventually computed but not saved
                     if cpopt= 2  <p_lmn|in> are already in memory;
                                  only derivatives are computed here and not saved
 (if useylm=0, should have cpopt=-1)

NOTES

  cprj (as well as cg) is distributed over band processors.
  Only the mod((iband-1)/mpi_enreg%bandpp,mpi_enreg%nproc_band) projected WFs are stored on each proc.

SOURCE

636 subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,&
637                        blocksize,mpi_enreg,nnlout,paw_opt,signs,gsc, tim_nonlop,cwavef,gvnlc, &
638                        already_transposed,gpu_option,vectproj) ! optional
639 
640 !Arguments ------------------------------------
641  integer,         intent(in)            :: blocksize,choice,cpopt,idir,signs,nnlout,paw_opt
642  logical,optional,intent(in)            :: already_transposed
643  integer,optional,intent(in)            :: gpu_option
644  real(dp),        intent(in)            :: lambdablock(blocksize)
645  real(dp),        intent(out)           :: enlout_block(nnlout*blocksize),gvnlc(:,:),gsc(:,:)
646  real(dp),        intent(inout)         :: cwavef(:,:)
647  real(dp),ABI_CONTIGUOUS optional,intent(inout)        :: vectproj(:,:,:)
648  type(gs_hamiltonian_type),intent(in)   :: hamk
649  type(mpi_type),intent(in)              :: mpi_enreg
650  type(pawcprj_type),intent(inout)       :: cwaveprj(:,:)
651 
652 !Local variables-------------------------------
653 !scalars
654  integer :: bandpp,ier,ikpt_this_proc,my_nspinor,ndatarecv,nproc_band,npw,nspinortot
655  integer :: spaceComm=0,tim_nonlop
656  logical :: do_transpose
657  integer :: l_gpu_option
658  !character(len=500) :: msg
659 !arrays
660  integer,  allocatable :: index_wavef_band(:)
661  integer,  allocatable :: rdisplsloc(:),recvcountsloc(:),sdisplsloc(:),sendcountsloc(:)
662  integer,ABI_CONTIGUOUS  pointer :: rdispls(:),recvcounts(:),sdispls(:),sendcounts(:)
663  real(dp) :: lambda_nonlop(mpi_enreg%bandpp)
664  real(dp) :: tsec(2)
665 
666 #if defined HAVE_GPU && defined HAVE_YAKL
667  real(c_double), ABI_CONTIGUOUS pointer :: cwavef_alltoall2(:,:) => null()
668  real(c_double), ABI_CONTIGUOUS pointer :: gvnlc_alltoall2(:,:)  => null()
669  real(c_double), ABI_CONTIGUOUS pointer :: gsc_alltoall2(:,:)    => null()
670  integer(kind=C_SIZE_T) :: cwavef_alltoall2_size
671  integer(kind=C_SIZE_T) :: gvnlc_alltoall2_size
672  integer(kind=C_SIZE_T) :: gsc_alltoall2_size
673 #else
674  real(dp), allocatable :: cwavef_alltoall2(:,:)
675  real(dp), allocatable :: gvnlc_alltoall2(:,:)
676  real(dp), allocatable :: gsc_alltoall2(:,:)
677 #endif
678 
679  real(dp), allocatable :: cwavef_alltoall1(:,:)
680  real(dp), allocatable :: gvnlc_alltoall1(:,:)
681  real(dp), allocatable :: gsc_alltoall1(:,:)
682  real(dp), allocatable :: enlout(:)
683 
684 #if defined HAVE_GPU && defined HAVE_YAKL
685  ! this buffer is necessary to avoid mixing "managed memory" buffer with "regular memory" buffer in MPI calls
686  ! Just to be clear:
687  ! - managed memory means memory allocated using ABI_MALLOC_MANAGED
688  ! - regular memory means memory allocated using either ABI_MALLOC or ABI_MALLOC_CUDA
689  !
690  ! here we chose to use a GPU buffer, would it be better to use a CPU buffer ? To be checked.
691  !real(dp), allocatable :: cwavef_mpi(:,:)
692  type(c_ptr)            :: cwavef_mpi_c
693  real(c_double),pointer :: cwavef_mpi(:,:)
694 #endif
695 
696 ! *************************************************************************
697 
698  DBG_ENTER('COLL')
699 
700  call timab(570,1,tsec)
701 
702  do_transpose = .true.
703  if(present(already_transposed)) then
704    if(already_transposed) do_transpose = .false.
705  end if
706 
707  l_gpu_option=ABI_GPU_DISABLED
708  if (present(gpu_option)) then
709     l_gpu_option = gpu_option
710  end if
711 
712  nproc_band = mpi_enreg%nproc_band
713  bandpp     = mpi_enreg%bandpp
714  spaceComm=mpi_enreg%comm_fft
715  if(mpi_enreg%paral_kgb==1) spaceComm=mpi_enreg%comm_band
716  my_nspinor=max(1,hamk%nspinor/mpi_enreg%nproc_spinor)
717  nspinortot=hamk%nspinor
718 
719 !Check sizes
720  npw=hamk%npw_k;if (.not.do_transpose) npw=hamk%npw_fft_k
721  if (size(cwavef)/=2*npw*my_nspinor*blocksize) then
722    ABI_BUG('Incorrect size for cwavef!')
723  end if
724  if(choice/=0.and.signs==2) then
725    if (paw_opt/=3) then
726      if (size(gvnlc)/=2*npw*my_nspinor*blocksize) then
727        ABI_BUG('Incorrect size for gvnlc!')
728      end if
729    end if
730    if(paw_opt>=3) then
731      if (size(gsc)/=2*npw*my_nspinor*blocksize) then
732        ABI_BUG('Incorrect size for gsc!')
733      end if
734    end if
735  end if
736  if(cpopt>=0.and. .not. present(vectproj)) then
737    if (size(cwaveprj)/=hamk%natom*my_nspinor*mpi_enreg%bandpp) then
738      ABI_BUG('Incorrect size for cwaveprj!')
739    end if
740  end if
741 
742  ABI_MALLOC(sendcountsloc,(nproc_band))
743  ABI_MALLOC(sdisplsloc   ,(nproc_band))
744  ABI_MALLOC(recvcountsloc,(nproc_band))
745  ABI_MALLOC(rdisplsloc   ,(nproc_band))
746 
747  ikpt_this_proc=bandfft_kpt_get_ikpt()
748 
749  recvcounts   => bandfft_kpt(ikpt_this_proc)%recvcounts(:)
750  sendcounts   => bandfft_kpt(ikpt_this_proc)%sendcounts(:)
751  rdispls      => bandfft_kpt(ikpt_this_proc)%rdispls   (:)
752  sdispls      => bandfft_kpt(ikpt_this_proc)%sdispls   (:)
753  ndatarecv    =  bandfft_kpt(ikpt_this_proc)%ndatarecv
754 
755  if(hamk%gpu_option==ABI_GPU_KOKKOS) then
756 #if defined HAVE_GPU && defined HAVE_YAKL
757    ABI_MALLOC_MANAGED(cwavef_alltoall2, (/2,ndatarecv*my_nspinor*bandpp/))
758    cwavef_alltoall2_size = 2*ndatarecv*my_nspinor*bandpp*dp
759 
760    if (paw_opt >= 0 .and. paw_opt < 3) then
761       gsc_alltoall2 => null()
762       gsc_alltoall2_size = 0
763    else
764       ABI_MALLOC_MANAGED(gsc_alltoall2,    (/2,ndatarecv*my_nspinor*(paw_opt/3)*bandpp/))
765       gsc_alltoall2_size = 2*ndatarecv*my_nspinor*(paw_opt/3)*bandpp*dp
766    endif
767 
768    ABI_MALLOC_MANAGED(gvnlc_alltoall2,  (/2,ndatarecv*my_nspinor*bandpp/))
769    gvnlc_alltoall2_size = 2*ndatarecv*my_nspinor*bandpp*dp
770 #endif
771  else
772    ABI_MALLOC(cwavef_alltoall2, (2,ndatarecv*my_nspinor*bandpp))
773    ABI_MALLOC(gsc_alltoall2,    (2,ndatarecv*my_nspinor*(paw_opt/3)*bandpp))
774    ABI_MALLOC(gvnlc_alltoall2,  (2,ndatarecv*my_nspinor*bandpp))
775  end if
776 
777  if(do_transpose .and. (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)))then
778    ABI_MALLOC(cwavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
779    if(signs==2)then
780      if (paw_opt/=3) then
781        ABI_MALLOC(gvnlc_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
782      end if
783      if (paw_opt==3.or.paw_opt==4) then
784        ABI_MALLOC(gsc_alltoall1,(2,ndatarecv*my_nspinor*bandpp))
785      end if
786    end if
787  end if
788 
789  ABI_MALLOC(enlout,(nnlout*bandpp))
790  enlout = zero
791 
792  recvcountsloc(:)=recvcounts(:)*2*my_nspinor*bandpp
793  rdisplsloc(:)=rdispls(:)*2*my_nspinor*bandpp
794  sendcountsloc(:)=sendcounts(:)*2*my_nspinor
795  sdisplsloc(:)=sdispls(:)*2*my_nspinor
796 
797  if(do_transpose) then
798    call timab(581,1,tsec)
799    if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then
800    if (l_gpu_option == ABI_GPU_KOKKOS) then
801 #if defined HAVE_GPU && defined HAVE_YAKL
802       ABI_MALLOC_CUDA(cwavef_mpi_c,  INT(2, c_size_t) * npw * my_nspinor * blocksize * dp)
803       call c_f_pointer(cwavef_mpi_c, cwavef_mpi, (/2, npw * my_nspinor * blocksize/))
804 
805       ! use cwavef_mpi instead of cwavef (don't use managed memory in MPI calls)
806       call copy_gpu_to_gpu(cwavef_mpi_c, C_LOC(cwavef), INT(2, c_size_t) * npw * my_nspinor * blocksize * dp)
807 
808       call xmpi_alltoallv(cwavef_mpi,sendcountsloc,sdisplsloc,cwavef_alltoall1,&
809            &     recvcountsloc,rdisplsloc,spaceComm,ier)
810 
811       ABI_FREE_CUDA(cwavef_mpi_c)
812 #endif
813    else
814      call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall1,&
815          &     recvcountsloc,rdisplsloc,spaceComm,ier)
816    end if
817 
818 
819    else
820       call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,&
821            &     recvcountsloc,rdisplsloc,spaceComm,ier)
822    end if
823    call timab(581,2,tsec)
824  else
825    ! Here, we cheat, and use DCOPY to bypass some compiler's overzealous bound-checking
826    ! (ndatarecv*my_nspinor*bandpp might be greater than the declared size of cwavef)
827     if (l_gpu_option == ABI_GPU_KOKKOS) then
828 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS)
829        call copy_gpu_to_gpu(C_LOC(cwavef_alltoall2), C_LOC(cwavef), INT(2, c_size_t) * ndatarecv * my_nspinor * bandpp * dp)
830 #endif
831     else
832        call DCOPY(2*ndatarecv*my_nspinor*bandpp,cwavef,1,cwavef_alltoall2,1)
833     end if
834  end if
835 
836 !=====================================================================
837  if (bandpp==1) then
838 
839 
840    if (do_transpose .and. mpi_enreg%paral_spinor==0.and.nspinortot==2) then !Sort WF by spin
841      call prep_sort_wavef_spin(nproc_band,my_nspinor,ndatarecv,recvcounts,rdispls,index_wavef_band)
842      cwavef_alltoall2(:, :) = cwavef_alltoall1(:,index_wavef_band)
843    end if
844 
845    if (paw_opt==2) then
846       lambda_nonlop(1)=lambdablock(mpi_enreg%me_band+1)
847    end if
848    call nonlop(choice,cpopt,cwaveprj,enlout,hamk,idir,lambda_nonlop,mpi_enreg,1,nnlout,paw_opt,&
849 &   signs,gsc_alltoall2,tim_nonlop,cwavef_alltoall2,gvnlc_alltoall2,vectproj=vectproj)
850 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
851    !call gpu_device_synchronize()
852    !call gpu_data_prefetch_async_f(C_LOC(cwavef_alltoall2), cwavef_alltoall2_size, CPU_DEVICE_ID)
853    !call gpu_data_prefetch_async_f(C_LOC(gvnlc_alltoall2), gvnlc_alltoall2_size, CPU_DEVICE_ID)
854    !call gpu_data_prefetch_async_f(C_LOC(gsc_alltoall2), gsc_alltoall2_size, CPU_DEVICE_ID)
855 #endif
856 
857    if (do_transpose .and. mpi_enreg%paral_spinor == 0 .and. nspinortot==2.and.signs==2) then
858      if (paw_opt/=3) gvnlc_alltoall1(:,index_wavef_band)=gvnlc_alltoall2(:,:)
859      if (paw_opt==3.or.paw_opt==4) gsc_alltoall1(:,index_wavef_band)=gsc_alltoall2(:,:)
860    end if
861 
862  else   ! bandpp/=1
863 
864 !  -------------------------------------------------------------
865 !  Computation of the index used to sort the waves functions below bandpp
866 !  -------------------------------------------------------------
867    if(do_transpose) then
868      call prep_index_wavef_bandpp(nproc_band,bandpp,&
869 &     my_nspinor,ndatarecv,recvcounts,rdispls,index_wavef_band)
870 
871 !  -------------------------------------------------------
872 !  Sorting of the waves functions below bandpp
873 !  -------------------------------------------------------
874      cwavef_alltoall2(:,:) = cwavef_alltoall1(:,index_wavef_band)
875    end if
876 
877 !  -------------------------------------------------------
878 !  Call nonlop
879 !  -------------------------------------------------------
880    if(paw_opt == 2) then
881       lambda_nonlop(1:bandpp) = lambdablock((mpi_enreg%me_band*bandpp)+1:((mpi_enreg%me_band+1)*bandpp))
882    end if
883    call nonlop(choice,cpopt,cwaveprj,enlout,hamk,idir,lambda_nonlop,mpi_enreg,bandpp,nnlout,paw_opt,&
884 &   signs,gsc_alltoall2,tim_nonlop,cwavef_alltoall2,gvnlc_alltoall2,vectproj=vectproj)
885 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
886    if(hamk%gpu_option==ABI_GPU_KOKKOS) call gpu_device_synchronize()
887    !call gpu_data_prefetch_async_f(C_LOC(cwavef_alltoall2), cwavef_alltoall2_size, CPU_DEVICE_ID)
888    !call gpu_data_prefetch_async_f(C_LOC(gvnlc_alltoall2), gvnlc_alltoall2_size, CPU_DEVICE_ID)
889    !if (associated(gsc_alltoall2)) then
890    !   call gpu_data_prefetch_async_f(C_LOC(gsc_alltoall2), gsc_alltoall2_size, CPU_DEVICE_ID)
891    !end if
892 #endif
893 
894 !  -----------------------------------------------------
895 !  Sorting of waves functions below the processors
896 !  -----------------------------------------------------
897    if(do_transpose.and.signs==2) then
898      if (paw_opt/=3) gvnlc_alltoall1(:,index_wavef_band)=gvnlc_alltoall2(:,:)
899      if (paw_opt==3.or.paw_opt==4) gsc_alltoall1(:,index_wavef_band)=gsc_alltoall2(:,:)
900    end if
901 
902  end if
903 
904 !=====================================================================
905 !  -------------------------------------------------------
906 !  Deallocation
907 !  -------------------------------------------------------
908  if (allocated(index_wavef_band)) then
909    ABI_FREE(index_wavef_band)
910  end if
911 
912 !Transpose the gsc_alltoall or gvlnc_alltoall tabs
913 !according to the paw_opt and signs values
914  if(do_transpose) then
915    if (signs==2) then
916      call timab(581,1,tsec)
917      if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then
918        if (paw_opt/=3) then
919          call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,&
920 &         sendcountsloc,sdisplsloc,spaceComm,ier)
921        end if
922        if (paw_opt==3.or.paw_opt==4) then
923          call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,&
924 &         sendcountsloc,sdisplsloc,spaceComm,ier)
925        end if
926      else
927        if (paw_opt/=3) then
928          call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,&
929 &         sendcountsloc,sdisplsloc,spaceComm,ier)
930        end if
931        if (paw_opt==3.or.paw_opt==4) then
932          call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,&
933 &         sendcountsloc,sdisplsloc,spaceComm,ier)
934        end if
935      end if
936      call timab(581,2,tsec)
937    end if
938  else
939    ! TODO check other usages, maybe
940     if (l_gpu_option == ABI_GPU_KOKKOS) then
941 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS)
942        call copy_gpu_to_gpu(C_LOC(gsc), C_LOC(gsc_alltoall2), INT(2, c_size_t) * ndatarecv * my_nspinor * bandpp * dp)
943 #endif
944     else
945        call DCOPY(2*ndatarecv*my_nspinor*bandpp, gsc_alltoall2, 1, gsc, 1)
946     end if
947  end if
948 
949  if (nnlout>0) then
950    call xmpi_allgather(enlout,nnlout*bandpp,enlout_block,spaceComm,ier)
951  end if
952  ABI_FREE(enlout)
953  ABI_FREE(sendcountsloc)
954  ABI_FREE(sdisplsloc)
955  ABI_FREE(recvcountsloc)
956  ABI_FREE(rdisplsloc)
957 
958  if(hamk%gpu_option==ABI_GPU_KOKKOS) then
959 #if defined HAVE_GPU && defined HAVE_YAKL
960    ABI_FREE_MANAGED(cwavef_alltoall2)
961    ABI_FREE_MANAGED(gvnlc_alltoall2)
962    if (paw_opt >= 3) then
963       ABI_FREE_MANAGED(gsc_alltoall2)
964    end if
965 #endif
966  else
967    ABI_FREE(cwavef_alltoall2)
968    ABI_FREE(gvnlc_alltoall2)
969    ABI_FREE(gsc_alltoall2)
970  end if
971 
972  if(do_transpose .and. (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)))then
973    ABI_FREE(cwavef_alltoall1)
974    if(signs==2)then
975      if (paw_opt/=3) then
976        ABI_FREE(gvnlc_alltoall1)
977      end if
978      if (paw_opt==3.or.paw_opt==4) then
979        ABI_FREE(gsc_alltoall1)
980      end if
981    end if
982  end if
983 
984  call timab(570,2,tsec)
985 
986  DBG_EXIT('COLL')
987 
988 end subroutine prep_nonlop

ABINIT/prep_sort_wavef_spin [ Functions ]

[ Top ] [ Functions ]

NAME

 prep_sort_wavef_spin

FUNCTION

 Compute index used to sort a spinorial wave-function by spin
 Sort to have all nspinor=1 fisrt, then all nspinor=2

INPUTS

  nproc_band=size of "band" communicator
  nspinor=number of spinorial components of the wavefunction
  ndatarecv=total number of values on all processors
  recvcounts(nproc_band)= number of received values by the processor
  rdispls(nproc_band)= offsets of the received values by the processor

OUTPUT

  index_wavef(:)=array containing the sorted indexes (pointer, allocated in this routine)

SOURCE

2160 subroutine prep_sort_wavef_spin(nproc_band,nspinor,ndatarecv,recvcounts,rdispls,index_wavef)
2161 
2162 !Arguments ------------------------------------
2163 !scalars
2164  integer,intent(in) :: ndatarecv,nproc_band,nspinor
2165 !arrays
2166  integer,intent(in) :: rdispls(nproc_band),recvcounts(nproc_band)
2167  integer,allocatable,intent(out) :: index_wavef(:)
2168 
2169 !Local variables-------------------------------
2170 !scalars
2171  integer :: isft,isft1,iproc,iindex
2172 !arrays
2173  integer,allocatable :: recvcountsloc(:),rdisplsloc(:)
2174 
2175 ! *********************************************************************
2176 
2177  ABI_MALLOC(index_wavef,(ndatarecv*nspinor))
2178 
2179  ABI_MALLOC(recvcountsloc,(nproc_band))
2180  ABI_MALLOC(rdisplsloc,(nproc_band))
2181  recvcountsloc(:)=recvcounts(:)*2*nspinor
2182  rdisplsloc(:)=rdispls(:)*2*nspinor
2183 
2184 !---------------------------------------------
2185 !Loops on bandpp and processors band
2186 !---------------------------------------------
2187  isft=0
2188  do iproc=1,nproc_band
2189 
2190 !  ===== Spin up
2191    if (iproc==1) then
2192      isft= 0
2193    else
2194      isft= sum(recvcounts(1: (iproc-1)))
2195    end if
2196    isft1 = 0.5*rdisplsloc(iproc)
2197 
2198    index_wavef(1+isft:isft+recvcounts(iproc))= &
2199 &   (/(iindex,iindex=isft1+1,isft1+recvcounts(iproc))/)
2200 
2201 !  =====Spin down
2202    if (iproc==1)then
2203      isft=sum(recvcounts(1:nproc_band))
2204    else
2205      isft=sum(recvcounts(1:nproc_band)) &
2206 &     +sum(recvcounts(1:iproc-1))
2207    end if
2208    isft1 = 0.5 * rdisplsloc(iproc) + recvcounts(iproc)
2209 
2210    index_wavef(1+isft:isft+recvcounts(iproc))= &
2211 &   (/(iindex,iindex=isft1+1,isft1+ recvcounts(iproc))/)
2212 
2213  end do
2214 
2215  ABI_FREE(recvcountsloc)
2216  ABI_FREE(rdisplsloc)
2217 
2218 end subroutine prep_sort_wavef_spin

ABINIT/prep_wavef_sym_do [ Functions ]

[ Top ] [ Functions ]

NAME

 prep_wavef_sym_do

FUNCTION

 this routine associates waves functions by two as following
      E(G)  = C(G) + D(G)
      E(-G) = C*(G) + iD*(G)
 the values are distributed on the processors in function of
 the value of mpi_enreg%distribfft%tab_fftwf2_distrib( (-kg_k_gather(2,i) )

INPUTS

  mpi_enreg          = information about mpi parallelization
  bandpp             = number of couple of waves functions
  nspinor            = number of spin
  ndatarecv          = number of values received by the processor and sended
                       by the other processors band
  ndatarecv_tot      = total number of received values
                       (ndatarecv   + number of received opposited planewave coordinates)
  ndatasend_sym      = number of sended values to the processors fft to create opposited
                       planewave coordinates
  tab_proc           = positions of opposited planewave coordinates in the list of the
                       processors fft
  cwavef_alltoall    = planewave coefficients of wavefunction
                      ( initial of the processor + sended by other processors band)
  sendcounts_sym     = number of sended values by the processor to each processor fft
  sdispls_sym        = postions of the sended values by the processor to each processor fft

  recvcounts_sym     = number of the received values by the processor from each processor fft
  rdispls_sym        = postions of the received values by the processor from each processor fft

OUTPUT

  ewavef_alltoall_sym = planewave coefficients of wavefunction
                        initial of the processor +
                        sended by other processors band +
                        sended by other processors fft  +
                        and compisited if bandpp >1
  index_wavef_send    = index to send the values in blocks to the other processor fft

SIDE EFFECTS

SOURCE

1598 subroutine prep_wavef_sym_do(mpi_enreg,bandpp,nspinor,&
1599 &     ndatarecv,&
1600 &     ndatarecv_tot,ndatasend_sym,tab_proc,&
1601 &     cwavef_alltoall,&
1602 &     sendcounts_sym,sdispls_sym,&
1603 &     recvcounts_sym,rdispls_sym,&
1604 &     ewavef_alltoall_sym,&
1605 &     index_wavef_send)
1606 
1607 !Arguments ------------------------------------
1608 !scalars
1609  integer,intent(in) :: bandpp,ndatarecv,ndatarecv_tot,ndatasend_sym
1610  integer,intent(in) :: nspinor
1611  type(mpi_type),intent(in) :: mpi_enreg
1612 !arrays
1613  integer,allocatable,intent(out) :: index_wavef_send(:)
1614  integer,intent(in) :: rdispls_sym(:),recvcounts_sym(:)
1615  integer,intent(in) :: sdispls_sym(:),sendcounts_sym(:)
1616  integer,intent(in) :: tab_proc(:)
1617  real(dp),intent(inout) :: cwavef_alltoall(2,ndatarecv*nspinor*bandpp)
1618  real(dp),pointer :: ewavef_alltoall_sym(:,:)
1619 
1620 !Local variables-------------------------------
1621 !scalars
1622  integer :: bandpp_sym,ibandpp,idatarecv,ideb_loc,idebc,idebd,idebe
1623  integer :: ier,ifin_loc,ifinc,ifind,ifine,iproc,jbandpp,jsendloc
1624  integer :: kbandpp,newspacecomm,nproc_fft
1625  logical :: flag_compose
1626 !arrays
1627  integer,allocatable :: rdispls_sym_loc(:),recvcounts_sym_loc(:)
1628  integer,allocatable :: sdispls_sym_loc(:),sendcounts_sym_loc(:)
1629  real(dp),allocatable :: ewavef_alltoall_loc(:,:),ewavef_alltoall_send(:,:)
1630 
1631 ! *********************************************************************
1632 
1633 !DEBUG
1634 !write(std_out,*)' prep_wavef_sym_do : enter '
1635 !ENDDEBUG
1636 
1637 !---------------------------------------------
1638 !Initialisation
1639 !---------------------------------------------
1640  nproc_fft    = mpi_enreg%nproc_fft
1641 
1642  newspacecomm = mpi_enreg%comm_fft
1643 
1644  if (modulo(bandpp,2)==0) then
1645    bandpp_sym   = bandpp/2
1646    flag_compose = .TRUE.
1647  else
1648    bandpp_sym   = bandpp
1649    flag_compose = .FALSE.
1650  end if
1651 
1652 !---------------------------------------------
1653 !Allocation
1654 !---------------------------------------------
1655  ABI_MALLOC(ewavef_alltoall_sym     ,(2,ndatarecv_tot*bandpp_sym))
1656  ABI_MALLOC(ewavef_alltoall_loc     ,(2,ndatarecv    *bandpp_sym))
1657  ABI_MALLOC(ewavef_alltoall_send    ,(2,ndatasend_sym*bandpp_sym))
1658  ABI_MALLOC(index_wavef_send        ,(  ndatasend_sym*bandpp_sym))
1659 
1660  ABI_MALLOC(sendcounts_sym_loc    ,(nproc_fft))
1661  ABI_MALLOC(sdispls_sym_loc       ,(nproc_fft))
1662  ABI_MALLOC(recvcounts_sym_loc    ,(nproc_fft))
1663  ABI_MALLOC(rdispls_sym_loc       ,(nproc_fft))
1664 
1665 
1666 !Initialisation
1667 !--------------
1668  ewavef_alltoall_sym(:,:) =0.
1669  ewavef_alltoall_loc(:,:) =0.
1670 
1671  sendcounts_sym_loc(:) =0
1672  sdispls_sym_loc(:)    =0
1673  recvcounts_sym_loc(:) =0
1674  rdispls_sym_loc(:)    =0
1675 
1676  index_wavef_send(:)   =0
1677 
1678 
1679 !-------------------------------------------------
1680 !We are bandpp blocks which we want to :
1681 !associate by two      (band_sym==bandpp/2)
1682 !or not associate by two  (band_sym==bandpp)
1683 !
1684 !So We'll have got bandpp_sym blocks
1685 !So we loop on the bandpp_sym blocks
1686 !--------------------------------------------------
1687 
1688  do kbandpp=1,bandpp_sym
1689 
1690 !  position of the two blocks
1691 !  --------------------------
1692    ibandpp = (kbandpp-1) * 2
1693    jbandpp =  ibandpp    + 1
1694 
1695    idebe = (kbandpp-1) * ndatarecv_tot + 1
1696    ifine = idebe       + ndatarecv     - 1
1697 
1698    idebc = ibandpp * ndatarecv     + 1
1699    ifinc = idebc   + ndatarecv     - 1
1700 
1701    idebd = jbandpp * ndatarecv     + 1
1702    ifind = idebd   + ndatarecv     - 1
1703 
1704    ideb_loc = (kbandpp-1) * ndatarecv  + 1
1705    ifin_loc = ideb_loc    + ndatarecv  - 1
1706 
1707 
1708    if (flag_compose) then
1709 
1710 
1711 !    calcul ewavef(G)
1712 !    ----------------
1713      ewavef_alltoall_sym(1,idebe:ifine) =    &
1714 &     cwavef_alltoall(1,idebc:ifinc) &
1715 &     - cwavef_alltoall(2,idebd:ifind)
1716 
1717      ewavef_alltoall_sym(2,idebe:ifine) =    &
1718 &     cwavef_alltoall(2,idebc:ifinc) &
1719 &     + cwavef_alltoall(1,idebd:ifind)
1720 
1721 !    calcul ewavef_loc(-G)
1722 !    ---------------------
1723      ewavef_alltoall_loc(1,ideb_loc:ifin_loc) =  &
1724 &     cwavef_alltoall(1,idebc:ifinc) &
1725 &     + cwavef_alltoall(2,idebd:ifind)
1726 
1727      ewavef_alltoall_loc(2,ideb_loc:ifin_loc) =  &
1728 &     - cwavef_alltoall(2,idebc:ifinc) &
1729 &     + cwavef_alltoall(1,idebd:ifind)
1730    else
1731 
1732 !    calcul ewavef(G)
1733 !    ----------------
1734      ewavef_alltoall_sym(1,idebe:ifine)   = cwavef_alltoall(1,idebc:ifinc)
1735      ewavef_alltoall_sym(2,idebe:ifine)   = cwavef_alltoall(2,idebc:ifinc)
1736 
1737 !    calcul ewavef_loc(-G)
1738 !    ---------------------
1739      ewavef_alltoall_loc(1,ideb_loc:ifin_loc) =   cwavef_alltoall(1,idebc:ifinc)
1740      ewavef_alltoall_loc(2,ideb_loc:ifin_loc) = - cwavef_alltoall(2,idebc:ifinc)
1741 
1742    end if
1743 
1744  end do
1745 
1746 
1747 
1748 !------------------------------------------------------------------------
1749 !Creation of datas blocks for each processor fft from ewavef_alltoall_loc
1750 !to send datas by blocks with a alltoall...
1751 !------------------------------------------------------------------------
1752 
1753 !Position of the blocks
1754 !----------------------
1755  jsendloc=0
1756  do ibandpp=1,bandpp_sym
1757    do iproc=1,nproc_fft
1758      do idatarecv=1,ndatarecv
1759        if (tab_proc(idatarecv)==(iproc-1)) then
1760          jsendloc=jsendloc+1
1761          index_wavef_send(jsendloc)  = idatarecv + ndatarecv * (ibandpp-1)
1762        end if
1763      end do
1764    end do
1765  end do
1766 
1767 !Classment
1768 !----------
1769  ewavef_alltoall_send(:,:)=ewavef_alltoall_loc(:,index_wavef_send)
1770 
1771 
1772 !-------------------------------------------------
1773 !Calcul of the number of received and sended datas
1774 !-------------------------------------------------
1775  sendcounts_sym_loc = sendcounts_sym*2
1776  recvcounts_sym_loc = recvcounts_sym*2
1777 
1778 !------------------------------------------
1779 !Exchange of the datas ewavef_allto_all_loc
1780 !------------------------------------------
1781  do ibandpp=1,bandpp_sym
1782 
1783 !  ------------------------------------------------
1784 !  Deplacment of the sended datas because of bandpp
1785 !  ------------------------------------------------
1786    sdispls_sym_loc(:) = sdispls_sym(:) + ndatasend_sym * (ibandpp-1)
1787    sdispls_sym_loc    = sdispls_sym_loc   *2
1788 
1789 !  --------------------------------------------------
1790 !  Deplacment of the received datas because of bandpp
1791 !  --------------------------------------------------
1792    rdispls_sym_loc(:) = rdispls_sym(:) + ndatarecv_tot * (ibandpp-1)
1793    rdispls_sym_loc    = rdispls_sym_loc   *2
1794 
1795 
1796    call xmpi_alltoallv(&
1797 &   ewavef_alltoall_send(:,:) ,sendcounts_sym_loc,sdispls_sym_loc,&
1798 &   ewavef_alltoall_sym(:,:)  ,recvcounts_sym_loc,rdispls_sym_loc,&
1799 &   newspacecomm,ier)
1800 
1801  end do
1802 
1803 !-----------------------
1804 !Desallocation
1805 !-----------------------
1806 
1807  ABI_FREE(sendcounts_sym_loc)
1808  ABI_FREE(recvcounts_sym_loc)
1809  ABI_FREE(sdispls_sym_loc)
1810  ABI_FREE(rdispls_sym_loc)
1811 
1812  ABI_FREE(ewavef_alltoall_loc)
1813  ABI_FREE(ewavef_alltoall_send)
1814 
1815 end subroutine prep_wavef_sym_do

ABINIT/prep_wavef_sym_undo [ Functions ]

[ Top ] [ Functions ]

NAME

 prep_wavef_sym_undo

FUNCTION

 this routine dissociates each wave function in two waves functions as following
      C(G) =   ( E*(-G) + E(G))/2
      D(G) = i*( E*(-G) - E(G))/2
 the values are redistributed on the processors in function of
 the value of mpi_enreg%distribfft%tab_fftwf2_distrib( (-kg_k_gather(2,i) )

INPUTS

  mpi_enreg          = information about mpi parallelization
  bandpp             = number of groups of couple of waves functions
  nspinor            = number of spin
  ndatarecv          = number of values received by the processor and sended
                       by the other processors band
  ndatarecv_tot      = total number of received values
                       (ndatarecv   + number of received opposited planewave coordinates)
  ndatasend_sym      = number of sended values to the processors fft to create opposited
                       planewave coordinates
  idatarecv0         = position of the planewave coordinates (0,0,0)
  sendcounts_sym     = number of sended values by the processor to each processor fft
  sdispls_sym        = postions of the sended values by the processor to each processor fft

  recvcounts_sym     = number of the received values by the processor to each processor fft

  gwavef_alltoall_sym = planewave coefficients of wavefunction
                        initial of the processor +
                        sended by other processors band +
                        sended by other processors fft  +
                        and composited if bandpp >1
  index_wavef_send    = index to send the values by block to the other processor fft

OUTPUT

  gwavef_alltoall     = planewave coefficients of wavefunction
                        ( for of the processor + to send to other processors band)

SOURCE

1859 subroutine prep_wavef_sym_undo(mpi_enreg,bandpp,nspinor,&
1860 &     ndatarecv,&
1861 &     ndatarecv_tot,ndatasend_sym,idatarecv0,&
1862 &     gwavef_alltoall,&
1863 &     sendcounts_sym,sdispls_sym,&
1864 &     recvcounts_sym,rdispls_sym,&
1865 &     gwavef_alltoall_sym,&
1866 &     index_wavef_send)
1867 
1868 !Arguments ------------------------------------
1869 !scalars
1870  integer,intent(in) :: bandpp,idatarecv0,ndatarecv,ndatarecv_tot,ndatasend_sym
1871  integer,intent(in) :: nspinor
1872  type(mpi_type),intent(in) :: mpi_enreg
1873 !arrays
1874  integer,intent(in) :: index_wavef_send(:),rdispls_sym(:),recvcounts_sym(:)
1875  integer,intent(in) :: sdispls_sym(:),sendcounts_sym(:)
1876  real(dp),intent(inout) :: gwavef_alltoall(2,ndatarecv*nspinor*bandpp)
1877  real(dp),intent(inout) :: gwavef_alltoall_sym(:,:)
1878 
1879 !Local variables-------------------------------
1880 !scalars
1881  integer :: bandpp_sym,ibandpp,ideb_loc,idebc,idebd
1882  integer :: idebe,ier,ifin_loc,ifinc,ifind,ifine
1883  integer :: jbandpp,kbandpp,newspacecomm,nproc_fft
1884  logical :: flag_compose
1885 !arrays
1886  integer,allocatable :: rdispls_sym_loc(:),recvcounts_sym_loc(:)
1887  integer,allocatable :: sdispls_sym_loc(:),sendcounts_sym_loc(:)
1888  real(dp),allocatable :: gwavef_alltoall_loc(:,:),gwavef_alltoall_rcv(:,:)
1889 
1890 ! *********************************************************************
1891 
1892 !DEBUG
1893 !write(std_out,*)' prep_wavef_sym_undo : enter '
1894 !ENDDEBUG
1895 
1896 
1897 !---------------------------------------------
1898 !Initialisation
1899 !---------------------------------------------
1900  nproc_fft    = mpi_enreg%nproc_fft
1901 
1902  newspacecomm = mpi_enreg%comm_fft
1903 
1904  if (modulo(bandpp,2)==0) then
1905    bandpp_sym   = bandpp/2
1906    flag_compose = .TRUE.
1907  else
1908    bandpp_sym   = bandpp
1909    flag_compose = .FALSE.
1910  end if
1911 
1912 !---------------------------------------------
1913 !Allocation
1914 !---------------------------------------------
1915  ABI_MALLOC(gwavef_alltoall_loc     ,(2,ndatarecv     *bandpp_sym))
1916  ABI_MALLOC(gwavef_alltoall_rcv     ,(2,ndatasend_sym *bandpp_sym))
1917 
1918  ABI_MALLOC(sendcounts_sym_loc    ,(nproc_fft))
1919  ABI_MALLOC(sdispls_sym_loc       ,(nproc_fft))
1920  ABI_MALLOC(recvcounts_sym_loc    ,(nproc_fft))
1921  ABI_MALLOC(rdispls_sym_loc       ,(nproc_fft))
1922 
1923 
1924 !---------------------------------------------
1925 !Initialisation
1926 !---------------------------------------------
1927  gwavef_alltoall_loc(:,:) =0.
1928 
1929  sendcounts_sym_loc(:) =0
1930  sdispls_sym_loc(:)    =0
1931  recvcounts_sym_loc(:) =0
1932  rdispls_sym_loc(:)    =0
1933 
1934 
1935 !-------------------------------------------------
1936 !Calcul of number of the sended and received datas
1937 !-------------------------------------------------
1938  sendcounts_sym_loc = sendcounts_sym*2
1939  recvcounts_sym_loc = recvcounts_sym*2
1940 
1941 !----------------------------------------------------
1942 !Sending of the values
1943 !----------------------------------------------------
1944  do ibandpp = 1,bandpp_sym
1945 
1946 !  -------------------------------------------------
1947 !  Deplacment of the sended values because of bandpp
1948 !  -------------------------------------------------
1949    sdispls_sym_loc(:) = sdispls_sym(:) + ndatasend_sym * (ibandpp-1)
1950    sdispls_sym_loc    = sdispls_sym_loc   *2
1951 
1952 !  ---------------------------------------------------
1953 !  Deplacment of the received values because of bandpp
1954 !  ---------------------------------------------------
1955    rdispls_sym_loc(:) = rdispls_sym(:) + ndatarecv_tot * (ibandpp-1)
1956    rdispls_sym_loc    = rdispls_sym_loc   *2
1957 
1958 
1959    call xmpi_alltoallv(&
1960 &   gwavef_alltoall_sym(:,:) ,recvcounts_sym_loc,rdispls_sym_loc,&
1961 &   gwavef_alltoall_rcv(:,:) ,sendcounts_sym_loc,sdispls_sym_loc,&
1962 &   newspacecomm,ier)
1963 
1964  end do
1965 
1966 
1967 !----------------------
1968 !Dispatching the blocks
1969 !----------------------
1970  gwavef_alltoall_loc(:,index_wavef_send(:)) = gwavef_alltoall_rcv(:,:)
1971 
1972 !----------------------
1973 !Case  -kg = [0 0 0]
1974 !----------------------
1975  if (idatarecv0/=-1) then
1976    do kbandpp=1,bandpp_sym
1977      gwavef_alltoall_loc(:,(kbandpp-1)*ndatarecv     + idatarecv0)= &
1978      gwavef_alltoall_sym(:,(kbandpp-1)*ndatarecv_tot + idatarecv0)
1979    end do
1980  end if
1981 
1982 !---------------------------------------------------
1983 !Build of hwavef_alltoall
1984 !
1985 !We have got :
1986 !bandpp_sym blocks to dissociate
1987 !or   bandpp_sym blokcs to not dissociate
1988 !--------------------------------------------------
1989  do kbandpp=1,bandpp_sym
1990 
1991 !  position of the 2 blocks
1992 !  ----------------------------------
1993    ibandpp = (kbandpp-1) * 2
1994    jbandpp =  ibandpp    + 1
1995 
1996    idebe = (kbandpp-1) * ndatarecv_tot + 1
1997    ifine = idebe       + ndatarecv     - 1
1998 
1999    idebc = ibandpp * ndatarecv     + 1
2000    ifinc = idebc   + ndatarecv     - 1
2001 
2002    idebd = jbandpp * ndatarecv     + 1
2003    ifind = idebd   + ndatarecv     - 1
2004 
2005    ideb_loc = (kbandpp-1) * ndatarecv  + 1
2006    ifin_loc = ideb_loc    + ndatarecv  - 1
2007 
2008 
2009    if (flag_compose) then
2010 
2011 !    calcul cwavef(G)
2012 !    ----------------
2013      gwavef_alltoall(1,idebc:ifinc) =   gwavef_alltoall_sym(1,idebe:ifine)  &
2014 &     + gwavef_alltoall_loc(1,ideb_loc:ifin_loc)
2015      gwavef_alltoall(2,idebc:ifinc) =   gwavef_alltoall_sym(2,idebe:ifine)  &
2016 &     - gwavef_alltoall_loc(2,ideb_loc:ifin_loc)
2017 
2018 !    calcul dwavef(G)
2019 !    ------------------
2020      gwavef_alltoall(1,idebd:ifind) =   gwavef_alltoall_sym(2,idebe:ifine) &
2021 &     + gwavef_alltoall_loc(2,ideb_loc:ifin_loc)
2022      gwavef_alltoall(2,idebd:ifind) = - gwavef_alltoall_sym(1,idebe:ifine) &
2023 &     + gwavef_alltoall_loc(1,ideb_loc:ifin_loc)
2024    else
2025 
2026 !    calcul cwavef(G)
2027 !    ----------------
2028      gwavef_alltoall(1,idebc:ifinc) =   gwavef_alltoall_sym(1,idebe:ifine)  &
2029 &     + gwavef_alltoall_loc(1,ideb_loc:ifin_loc)
2030      gwavef_alltoall(2,idebc:ifinc) =   gwavef_alltoall_sym(2,idebe:ifine)  &
2031 &     - gwavef_alltoall_loc(2,ideb_loc:ifin_loc)
2032    end if
2033 
2034  end do
2035 
2036 !We divise by two
2037  gwavef_alltoall(:,:)    = gwavef_alltoall(:,:)/2
2038 
2039 !-----------------------
2040 !Desallocation
2041 !-----------------------
2042 
2043  ABI_FREE(sendcounts_sym_loc)
2044  ABI_FREE(recvcounts_sym_loc)
2045  ABI_FREE(sdispls_sym_loc)
2046  ABI_FREE(rdispls_sym_loc)
2047 
2048  ABI_FREE(gwavef_alltoall_loc)
2049  ABI_FREE(gwavef_alltoall_rcv)
2050 
2051 end subroutine prep_wavef_sym_undo