TABLE OF CONTENTS
- ABINIT/m_prep_kgb
- ABINIT/prep_fourwf
- ABINIT/prep_getghc
- ABINIT/prep_index_wavef_bandpp
- abinit/prep_nonlop
- ABINIT/prep_sort_wavef_spin
- ABINIT/prep_wavef_sym_do
- ABINIT/prep_wavef_sym_undo
ABINIT/m_prep_kgb [ Modules ]
NAME
m_prep_kgb
FUNCTION
This module provides wrappers that used to apply the full Hamiltonian or just the Vnl part or to perform the FFT of the wavefunctions when the orbitals are distributed in linalg mode (paral_kgb = 1).
COPYRIGHT
Copyright (C) 1998-2024 ABINIT group (FBottin,MT,GZ,MD,FDahm) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
17 #if defined HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "abi_common.h" 22 23 module m_prep_kgb 24 25 use defs_basis 26 use m_abicore 27 use m_errors 28 use m_xmpi 29 use m_abi_linalg 30 31 use defs_abitypes, only : MPI_type 32 use m_time, only : timab 33 use m_bandfft_kpt, only : bandfft_kpt, bandfft_kpt_get_ikpt, bandfft_kpt_type 34 use m_pawcprj, only : pawcprj_type 35 use m_hamiltonian, only : gs_hamiltonian_type 36 use m_nonlop, only : nonlop 37 use m_getghc, only : multithreaded_getghc 38 use m_fft, only : fourwf 39 40 #if defined HAVE_GPU_CUDA 41 use m_manage_cuda 42 #endif 43 44 #if defined HAVE_YAKL 45 use gator_mod 46 #endif 47 48 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) 49 use m_gpu_toolbox, only : CPU_DEVICE_ID, gpu_device_synchronize, gpu_data_prefetch_async 50 #endif 51 52 #if defined HAVE_OPENMP_OFFLOAD 53 use m_ompgpu_fourwf 54 #endif 55 56 implicit none 57 58 private
ABINIT/prep_fourwf [ Functions ]
NAME
prep_fourwf
FUNCTION
this routine prepares the data to the call of fourwf.
INPUTS
blocksize= size of block for FFT cwavef(2,npw*ndat)=planewave coefficients of wavefunction (one spinorial component?). dtfil <type(datafiles_type)>=variables related to files kg_k(3,npw_k)=reduced planewave coordinates. lmnmax=if useylm=1, max number of (l,m,n) comp. over all type of psps =if useylm=0, max number of (l,n) comp. over all type of psps mgfft=maximum size of 1d ffts mpi_enreg=information about mpi parallelization mpsang= 1+maximum angular momentum for nonlocal pseudopotentials mpssoang= 1+maximum (spin*angular momentum) for nonlocal pseudopotentials natom=number of atoms in cell. nband_k=number of bands at this k point for that spin polarization ndat=number of FFT to do in // ngfft(18)= contain all needed information about 3D FFT npw_k=number of plane waves at this k point nspinor=number of spinorial components of the wavefunctions ntypat=number of types of atoms in unit cell. n4,n5,n6 used for dimensionning of vlocal option_fourwf=option for fourwf (see fourwf.F90) prtvol=control print volume and debugging output ucvol=unit cell volume [bandfft_kpt_tab]= (optional) if present, contains tabs used to implement the "band-fft" parallelism if not present, the bandfft_kpt global variable is used [gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
OUTPUT
gwavef=(2,npw*ndat)=matrix elements <G|H|C>.
SIDE EFFECTS
SOURCE
1032 subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,iblock,istwf_k,mgfft,& 1033 & mpi_enreg,nband_k,ndat,ngfft,npw_k,n4,n5,n6,occ_k,option_fourwf,ucvol,wtk,& 1034 & bandfft_kpt_tab,gpu_option) ! Optional arguments 1035 1036 !Arguments ------------------------------------ 1037 !scalars 1038 integer,intent(in) :: blocksize,iblock,istwf_k,mgfft,n4,n5,n6,nband_k,ndat,npw_k 1039 integer,intent(in) :: option_fourwf 1040 integer,intent(in),optional :: gpu_option 1041 real(dp),intent(in) :: ucvol,wtk 1042 type(bandfft_kpt_type),optional,target,intent(in) :: bandfft_kpt_tab 1043 type(mpi_type),intent(in) :: mpi_enreg 1044 !arrays 1045 integer,intent(in) :: ngfft(18) 1046 real(dp),intent(in) :: occ_k(nband_k) 1047 real(dp),intent(out) :: rhoaug(n4,n5,n6) 1048 real(dp),intent(in), target :: cwavef(2,npw_k*blocksize) 1049 real(dp),target,intent(inout) :: wfraug(2,n4,n5,n6*ndat) 1050 1051 !Local variables------------------------------- 1052 !scalars 1053 integer :: bandpp,bandpp_sym,ier,iibandpp,ikpt_this_proc,ind_occ,ind_occ1,ind_occ2,ipw 1054 integer :: istwf_k_,jjbandpp,me_fft,nd3,nproc_band,nproc_fft,npw_fft 1055 integer :: spaceComm=0,tim_fourwf,gpu_option_ 1056 integer,pointer :: idatarecv0,ndatarecv,ndatarecv_tot,ndatasend_sym 1057 logical :: flag_inv_sym,have_to_reequilibrate 1058 real(dp) :: weight,weight1,weight2 1059 type(bandfft_kpt_type),pointer :: bandfft_kpt_ptr 1060 !arrays 1061 integer,ABI_CONTIGUOUS pointer :: indices_pw_fft(:),kg_k_fft(:,:),kg_k_gather(:,:),kg_k_gather_sym(:,:) 1062 integer,ABI_CONTIGUOUS pointer :: rdispls(:),rdispls_sym(:) 1063 integer,ABI_CONTIGUOUS pointer :: recvcounts(:),recvcount_fft(:),recvcounts_sym(:),recvcounts_sym_tot(:) 1064 integer,ABI_CONTIGUOUS pointer :: recvdisp_fft(:),sdispls(:),sdispls_sym(:) 1065 integer,ABI_CONTIGUOUS pointer :: sendcounts(:),sendcount_fft(:),sendcounts_sym(:),sendcounts_sym_all(:) 1066 integer,ABI_CONTIGUOUS pointer :: senddisp_fft(:),tab_proc(:) 1067 integer,allocatable :: rdisplsloc(:) 1068 integer,allocatable :: recvcountsloc(:),sdisplsloc(:) 1069 integer,allocatable :: sendcountsloc(:) 1070 integer,allocatable :: index_wavef_band(:),index_wavef_send(:) 1071 integer,pointer :: gbound_(:,:) 1072 real(dp) :: dummy(2,1),tsec(2) 1073 real(dp),allocatable :: buff_wf(:,:) 1074 1075 #if defined HAVE_GPU && defined HAVE_YAKL 1076 real(c_double), ABI_CONTIGUOUS pointer :: cwavef_alltoall1(:,:) => null() 1077 #else 1078 real(dp),allocatable :: cwavef_alltoall1(:,:) 1079 #endif 1080 real(dp),allocatable :: cwavef_alltoall2(:,:) 1081 real(dp),allocatable :: cwavef_fft(:,:), cwavef_fft_tr(:,:) 1082 real(dp),allocatable :: weight_t(:),weight1_t(:),weight2_t(:) 1083 real(dp),pointer :: ewavef_alltoall_sym(:,:),wfraug_ptr(:,:,:,:) 1084 1085 #if defined HAVE_GPU && defined HAVE_YAKL 1086 ! this buffer is necessary to avoid mixing "managed memory" buffer with "regular memory" buffer in MPI calls 1087 ! Just to be clear: 1088 ! - managed memory means memory allocated using ABI_MALLOC_MANAGED 1089 ! - regular memory means memory allocated using either ABI_MALLOC or ABI_MALLOC_CUDA 1090 ! 1091 ! here we chose to use a CPU buffer, would it be better to use a GPU buffer ? To be checked. 1092 real(dp), allocatable :: cwavef_mpi(:,:) 1093 !type(c_ptr) :: cwavef_mpi_c 1094 !real(c_double),pointer :: cwavef_mpi(:,:) 1095 #endif 1096 1097 ! ************************************************************************* 1098 1099 ABI_CHECK((option_fourwf/=3),'Option=3 (FFT r->g) not implemented') 1100 ABI_CHECK((mpi_enreg%bandpp==ndat),'BUG: bandpp/=ndat') 1101 1102 spaceComm=mpi_enreg%comm_band 1103 nproc_band = mpi_enreg%nproc_band 1104 nproc_fft = mpi_enreg%nproc_fft 1105 bandpp = mpi_enreg%bandpp 1106 me_fft = mpi_enreg%me_fft 1107 1108 gpu_option_=ABI_GPU_DISABLED;if (present(gpu_option)) gpu_option_=gpu_option 1109 1110 if (present(bandfft_kpt_tab)) then 1111 bandfft_kpt_ptr => bandfft_kpt_tab 1112 else 1113 ikpt_this_proc=bandfft_kpt_get_ikpt() 1114 bandfft_kpt_ptr => bandfft_kpt(ikpt_this_proc) 1115 end if 1116 1117 istwf_k_=istwf_k 1118 flag_inv_sym = (istwf_k_==2 .and. any(ngfft(7) == [401,402,312,512])) 1119 if (option_fourwf==0) flag_inv_sym=((flag_inv_sym).and.(gpu_option_==ABI_GPU_DISABLED)) 1120 1121 if (flag_inv_sym) then 1122 istwf_k_ = 1 1123 if (modulo(bandpp,2)==0) then 1124 bandpp_sym = bandpp/2 1125 else 1126 bandpp_sym = bandpp 1127 end if 1128 end if 1129 1130 !==================================================================================== 1131 ABI_MALLOC(sendcountsloc,(nproc_band)) 1132 ABI_MALLOC(sdisplsloc ,(nproc_band)) 1133 ABI_MALLOC(recvcountsloc,(nproc_band)) 1134 ABI_MALLOC(rdisplsloc ,(nproc_band)) 1135 1136 recvcounts =>bandfft_kpt_ptr%recvcounts(:) 1137 sendcounts =>bandfft_kpt_ptr%sendcounts(:) 1138 rdispls =>bandfft_kpt_ptr%rdispls (:) 1139 sdispls =>bandfft_kpt_ptr%sdispls (:) 1140 ndatarecv =>bandfft_kpt_ptr%ndatarecv 1141 1142 kg_k_gather =>bandfft_kpt_ptr%kg_k_gather(:,:) 1143 gbound_ =>bandfft_kpt_ptr%gbound(:,:) 1144 1145 if (flag_inv_sym ) then 1146 idatarecv0 =>bandfft_kpt_ptr%idatarecv0 1147 ndatarecv_tot =>bandfft_kpt_ptr%ndatarecv_tot 1148 ndatasend_sym =>bandfft_kpt_ptr%ndatasend_sym 1149 kg_k_gather_sym =>bandfft_kpt_ptr%kg_k_gather_sym(:,:) 1150 rdispls_sym =>bandfft_kpt_ptr%rdispls_sym(:) 1151 recvcounts_sym =>bandfft_kpt_ptr%recvcounts_sym(:) 1152 recvcounts_sym_tot =>bandfft_kpt_ptr%recvcounts_sym_tot(:) 1153 sdispls_sym =>bandfft_kpt_ptr%sdispls_sym(:) 1154 sendcounts_sym =>bandfft_kpt_ptr%sendcounts_sym(:) 1155 sendcounts_sym_all =>bandfft_kpt_ptr%sendcounts_sym_all(:) 1156 tab_proc =>bandfft_kpt_ptr%tab_proc(:) 1157 end if 1158 1159 ABI_MALLOC(cwavef_alltoall2,(2,ndatarecv*bandpp)) 1160 if ( ((.not.flag_inv_sym) .and. (bandpp>1) ) .or. flag_inv_sym )then 1161 if(gpu_option_==ABI_GPU_KOKKOS) then 1162 #if defined HAVE_GPU && defined HAVE_YAKL 1163 ABI_MALLOC_MANAGED(cwavef_alltoall1,(/2,ndatarecv*bandpp/)) 1164 #endif 1165 else 1166 ABI_MALLOC(cwavef_alltoall1,(2,ndatarecv*bandpp)) 1167 end if 1168 end if 1169 1170 recvcountsloc(:)=recvcounts(:)*2*bandpp 1171 rdisplsloc(:)=rdispls(:)*2*bandpp 1172 sendcountsloc(:)=sendcounts(:)*2 1173 sdisplsloc(:)=sdispls(:)*2 1174 1175 call timab(547,1,tsec) 1176 if(gpu_option_==ABI_GPU_KOKKOS) then 1177 #if defined HAVE_GPU && defined HAVE_YAKL 1178 ABI_MALLOC(cwavef_mpi,(2,npw_k*blocksize)) 1179 1180 call gpu_data_prefetch_async(C_LOC(cwavef), INT(2, c_size_t)*npw_k*blocksize, CPU_DEVICE_ID) 1181 call gpu_device_synchronize() 1182 1183 cwavef_mpi(:,:) = cwavef(:,:) 1184 1185 call xmpi_alltoallv(cwavef_mpi,sendcountsloc,sdisplsloc,cwavef_alltoall2,& 1186 & recvcountsloc,rdisplsloc,spaceComm,ier) 1187 ABI_FREE(cwavef_mpi) 1188 #endif 1189 else 1190 call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,& 1191 & recvcountsloc,rdisplsloc,spaceComm,ier) 1192 end if 1193 call timab(547,2,tsec) 1194 1195 tim_fourwf=16 1196 1197 !Eventually adjust load balancing for FFT (by changing FFT distrib) 1198 have_to_reequilibrate = bandfft_kpt_ptr%have_to_reequilibrate 1199 if(have_to_reequilibrate) then 1200 npw_fft = bandfft_kpt_ptr%npw_fft 1201 sendcount_fft => bandfft_kpt_ptr%sendcount_fft(:) 1202 recvcount_fft => bandfft_kpt_ptr%recvcount_fft(:) 1203 senddisp_fft => bandfft_kpt_ptr%senddisp_fft(:) 1204 recvdisp_fft => bandfft_kpt_ptr%recvdisp_fft(:) 1205 indices_pw_fft => bandfft_kpt_ptr%indices_pw_fft(:) 1206 kg_k_fft => bandfft_kpt_ptr%kg_k_fft(:,:) 1207 ABI_MALLOC( buff_wf, (2,ndatarecv*bandpp) ) ! for sorting cgwavef 1208 ABI_MALLOC( cwavef_fft, (2,npw_fft*bandpp) ) 1209 if(bandpp>1) then 1210 ABI_MALLOC( cwavef_fft_tr, (2,npw_fft*bandpp)) 1211 end if 1212 end if 1213 1214 if (option_fourwf==0) wfraug(:,:,:,:)=zero 1215 1216 !==================================================================== 1217 if ((.not.(flag_inv_sym)) .and. (bandpp==1)) then 1218 1219 ! Compute the index of the band 1220 ind_occ = (iblock-1)*blocksize + mpi_enreg%me_band + 1 1221 1222 if(abs(occ_k(ind_occ))>=tol8.or.option_fourwf==0) then 1223 1224 ! Compute the weight of the band 1225 weight=occ_k(ind_occ)*wtk/ucvol 1226 1227 if(have_to_reequilibrate) then 1228 ! filling of sorted send buffers before exchange 1229 do ipw = 1 ,ndatarecv 1230 buff_wf(1:2, indices_pw_fft(ipw) ) = cwavef_alltoall2(1:2,ipw) 1231 end do 1232 call xmpi_alltoallv(buff_wf,2*sendcount_fft,2*senddisp_fft, & 1233 & cwavef_fft,2*recvcount_fft, 2*recvdisp_fft, mpi_enreg%comm_fft,ier) 1234 call fourwf(1,rhoaug,cwavef_fft,dummy,wfraug,gbound_,gbound_,& 1235 & istwf_k_,kg_k_fft,kg_k_fft,mgfft,mpi_enreg,1,& 1236 & ngfft,npw_fft,1,n4,n5,n6,option_fourwf,tim_fourwf,weight,weight,& 1237 & gpu_option=gpu_option_) 1238 else 1239 call fourwf(1,rhoaug,cwavef_alltoall2,dummy,wfraug,gbound_,gbound_,& 1240 & istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,1,& 1241 & ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,tim_fourwf,weight,weight,& 1242 & gpu_option=gpu_option_) 1243 end if 1244 if (option_fourwf==0.and.nproc_fft>1) then 1245 if (me_fft>0) then 1246 nd3=(ngfft(3)-1)/nproc_fft+1 1247 wfraug(:,:,:,me_fft*nd3+1:me_fft*nd3+nd3)=wfraug(:,:,:,1:nd3) 1248 wfraug(:,:,:,1:nd3)=zero 1249 end if 1250 call xmpi_sum(wfraug,mpi_enreg%comm_fft,ier) 1251 end if 1252 end if 1253 1254 !==================================================================== 1255 else if ((.not.(flag_inv_sym)) .and. (bandpp>1) ) then 1256 1257 ! ------------------------------------------------------------- 1258 ! Computation of the index to class the waves functions below bandpp 1259 ! ------------------------------------------------------------- 1260 call prep_index_wavef_bandpp(nproc_band,bandpp,& 1261 & 1,ndatarecv,& 1262 & recvcounts,rdispls,& 1263 & index_wavef_band) 1264 1265 ! ------------------------------------------------------- 1266 ! Sorting of the wave functions below bandpp 1267 ! ------------------------------------------------------- 1268 cwavef_alltoall1(:,:) = cwavef_alltoall2(:,index_wavef_band) 1269 1270 if(have_to_reequilibrate) then 1271 ! filling of sorted send buffers before exchange 1272 do iibandpp=1,bandpp 1273 do ipw = 1 ,ndatarecv 1274 buff_wf(1:2, iibandpp + bandpp*(indices_pw_fft(ipw)-1)) = & 1275 & cwavef_alltoall1(1:2,ipw + ndatarecv*(iibandpp-1)) 1276 end do 1277 end do 1278 call xmpi_alltoallv(buff_wf,2*bandpp*sendcount_fft,2*bandpp*senddisp_fft, & 1279 & cwavef_fft_tr,2*bandpp*recvcount_fft, 2*bandpp*recvdisp_fft, mpi_enreg%comm_fft,ier) 1280 do iibandpp=1,bandpp 1281 do ipw = 1 ,npw_fft 1282 cwavef_fft(1:2, ipw + npw_fft*(iibandpp -1)) = cwavef_fft_tr(1:2, iibandpp + bandpp*(ipw-1)) 1283 end do 1284 end do 1285 end if 1286 1287 ! ------------------- 1288 ! Fourier calculation 1289 ! ------------------- 1290 ! Cuda version 1291 if(gpu_option_/=ABI_GPU_DISABLED) then 1292 ABI_MALLOC(weight_t,(bandpp)) 1293 do iibandpp=1,bandpp 1294 ! Compute the index of the band 1295 ind_occ = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + iibandpp 1296 ! Compute the weight of the band 1297 weight_t(iibandpp)=occ_k(ind_occ)*wtk/ucvol 1298 if(abs(occ_k(ind_occ)) < tol8) weight_t(iibandpp) = zero 1299 end do 1300 ! Accumulate time because it is not done in gpu_fourwf 1301 call timab(240+tim_fourwf,1,tsec) 1302 if(gpu_option_==ABI_GPU_LEGACY) then 1303 #if defined HAVE_GPU_CUDA 1304 call gpu_fourwf(1,rhoaug,& 1305 & cwavef_alltoall1,& 1306 & dummy,wfraug,gbound_,gbound_,& 1307 & istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,bandpp,& 1308 & ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,& 1309 & tim_fourwf,weight_t,weight_t) 1310 #endif 1311 else if(gpu_option_==ABI_GPU_KOKKOS) then 1312 #if defined HAVE_GPU_CUDA 1313 call gpu_fourwf_managed(1,rhoaug,& 1314 & cwavef_alltoall1,& 1315 & dummy,wfraug,gbound_,gbound_,& 1316 & istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,bandpp,& 1317 & ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,& 1318 & tim_fourwf,weight_t,weight_t) 1319 #endif 1320 else if(gpu_option_==ABI_GPU_OPENMP) then 1321 #ifdef HAVE_OPENMP_OFFLOAD 1322 call ompgpu_fourwf (1,rhoaug,& 1323 & cwavef_alltoall1,& 1324 & dummy,wfraug,gbound_,gbound_,& 1325 & istwf_k_,kg_k_gather,kg_k_gather,mgfft,bandpp,& 1326 & ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,& 1327 & weight_t,weight_t) 1328 #endif 1329 end if ! gpu_option_ 1330 call timab(240+tim_fourwf,2,tsec) 1331 ABI_FREE(weight_t) 1332 1333 ! Standard version 1334 else 1335 do iibandpp=1,bandpp 1336 ! Compute the index of the band 1337 ind_occ = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + iibandpp 1338 ! Compute the weight of the band 1339 weight=occ_k(ind_occ)*wtk/ucvol 1340 if (option_fourwf==0) then 1341 wfraug_ptr => wfraug(:,:,:,(iibandpp-1)*n6+1:iibandpp*n6) 1342 else 1343 wfraug_ptr => wfraug 1344 end if 1345 if (abs(occ_k(ind_occ)) >=tol8.or.option_fourwf==0) then 1346 if(have_to_reequilibrate) then 1347 call fourwf(1,rhoaug, & 1348 & cwavef_fft(:,(npw_fft*(iibandpp-1))+1:(npw_fft*iibandpp)), & 1349 & dummy,wfraug_ptr,gbound_,gbound_,& 1350 & istwf_k_,kg_k_fft,kg_k_fft,mgfft,mpi_enreg,1,& 1351 & ngfft,npw_fft,1,n4,n5,n6,option_fourwf,tim_fourwf,weight,weight,& 1352 & gpu_option=gpu_option_) 1353 else 1354 call fourwf(1,rhoaug,& 1355 & cwavef_alltoall1(:,(ndatarecv*(iibandpp-1))+1:(ndatarecv*iibandpp)),& 1356 & dummy,wfraug_ptr,gbound_,gbound_,& 1357 & istwf_k_,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,1,& 1358 & ngfft,ndatarecv,1,n4,n5,n6,option_fourwf,& 1359 & tim_fourwf,weight,weight) 1360 end if 1361 if (option_fourwf==0.and.nproc_fft>1) then 1362 if (me_fft>0) then 1363 nd3=(ngfft(3)-1)/nproc_fft+1 1364 wfraug_ptr(:,:,:,me_fft*nd3+1:me_fft*nd3+nd3)=wfraug_ptr(:,:,:,1:nd3) 1365 wfraug_ptr(:,:,:,1:nd3)=zero 1366 end if 1367 call xmpi_sum(wfraug_ptr,mpi_enreg%comm_fft,ier) 1368 end if 1369 end if 1370 end do 1371 end if ! (gpu_option/=0) 1372 1373 ! ----------------------------------------------------- 1374 ! Sorting waves functions below the processors 1375 ! ----------------------------------------------------- 1376 ! cwavef_alltoall(:,index_wavef_band) = cwavef_alltoall(:,:) ! NOT NEEDED 1377 ABI_FREE(index_wavef_band) 1378 1379 !==================================================================== 1380 else if (flag_inv_sym) then 1381 1382 ! ------------------------------------------------------------- 1383 ! Computation of the index to class the waves functions below bandpp 1384 ! ------------------------------------------------------------- 1385 call prep_index_wavef_bandpp(nproc_band,bandpp,& 1386 & 1,ndatarecv,& 1387 & recvcounts,rdispls,& 1388 & index_wavef_band) 1389 1390 ! ------------------------------------------------------- 1391 ! Sorting the wave functions below bandpp 1392 ! ------------------------------------------------------- 1393 cwavef_alltoall1(:,:) = cwavef_alltoall2(:,index_wavef_band) 1394 1395 ! ------------------------------------------------------------ 1396 ! We associate the waves functions by two 1397 ! ------------------------------------------------------------ 1398 call prep_wavef_sym_do(mpi_enreg,bandpp,1,& 1399 & ndatarecv,& 1400 & ndatarecv_tot,ndatasend_sym,tab_proc,& 1401 & cwavef_alltoall1,& 1402 & sendcounts_sym,sdispls_sym,& 1403 & recvcounts_sym,rdispls_sym,& 1404 & ewavef_alltoall_sym,& 1405 & index_wavef_send) 1406 1407 ! ------------------------------------------------------------ 1408 ! Fourier calculation 1409 ! ------------------------------------------------------------ 1410 ! Cuda version 1411 if (gpu_option_/=ABI_GPU_DISABLED) then 1412 ABI_MALLOC(weight1_t,(bandpp_sym)) 1413 ABI_MALLOC(weight2_t,(bandpp_sym)) 1414 do iibandpp=1,bandpp_sym 1415 if (bandpp/=1) then 1416 ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp-1) 1417 ind_occ2 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp ) 1418 else 1419 ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + 1 1420 ind_occ2 = ind_occ1 1421 end if 1422 weight1_t(iibandpp) = occ_k(ind_occ1)*wtk/ucvol 1423 weight2_t(iibandpp) = occ_k(ind_occ2)*wtk/ucvol 1424 end do 1425 call timab(240+tim_fourwf,1,tsec) 1426 if (gpu_option_==ABI_GPU_LEGACY) then 1427 #if defined HAVE_GPU_CUDA 1428 call gpu_fourwf(1,rhoaug,& 1429 & ewavef_alltoall_sym,& 1430 & dummy,wfraug,gbound_,gbound_,& 1431 & istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,mpi_enreg,bandpp_sym,& 1432 & ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,& 1433 & tim_fourwf,weight1_t,weight2_t) 1434 #endif 1435 else if(gpu_option_==ABI_GPU_KOKKOS) then 1436 #if defined HAVE_GPU_CUDA 1437 call gpu_fourwf_managed(1,rhoaug,& 1438 & ewavef_alltoall_sym,& 1439 & dummy,wfraug,gbound_,gbound_,& 1440 & istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,mpi_enreg,bandpp_sym,& 1441 & ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,mpi_enreg%paral_kgb,& 1442 & tim_fourwf,weight1_t,weight2_t) 1443 #endif 1444 else if (gpu_option_==ABI_GPU_OPENMP) then 1445 #ifdef HAVE_OPENMP_OFFLOAD 1446 call ompgpu_fourwf(1,rhoaug,& 1447 & ewavef_alltoall_sym,& 1448 & dummy,wfraug,gbound_,gbound_,& 1449 & istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,bandpp_sym,& 1450 & ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,& 1451 & weight1_t,weight2_t) 1452 #endif 1453 end if ! gpu_option_ 1454 call timab(240+tim_fourwf,2,tsec) 1455 ABI_FREE(weight1_t) 1456 ABI_FREE(weight2_t) 1457 1458 ! Standard version 1459 else 1460 if (option_fourwf==0.and.bandpp>1) then 1461 ABI_MALLOC(wfraug_ptr,(2,n4,n5,n6)) 1462 else 1463 wfraug_ptr => wfraug 1464 end if 1465 do iibandpp=1,bandpp_sym 1466 if (bandpp/=1) then 1467 ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp-1) 1468 ind_occ2 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + (2*iibandpp ) 1469 else 1470 ind_occ1 = (iblock-1)*blocksize + (mpi_enreg%me_band * bandpp) + 1 1471 ind_occ2 = ind_occ1 1472 end if 1473 weight1 = occ_k(ind_occ1)*wtk/ucvol 1474 weight2 = occ_k(ind_occ2)*wtk/ucvol 1475 call fourwf(1,rhoaug,& 1476 & ewavef_alltoall_sym(:,(ndatarecv_tot*(iibandpp-1))+1:(ndatarecv_tot*iibandpp)),& 1477 & dummy,wfraug_ptr,gbound_,gbound_,& 1478 & istwf_k_,kg_k_gather_sym,kg_k_gather_sym,mgfft,mpi_enreg,1,& 1479 & ngfft,ndatarecv_tot,1,n4,n5,n6,option_fourwf,& 1480 & tim_fourwf,weight1,weight2) 1481 if (option_fourwf==0) then 1482 if (modulo(bandpp,2)==0) then 1483 jjbandpp=2*iibandpp-1 1484 wfraug(1,:,:,(jjbandpp-1)*n6+1:jjbandpp*n6)=wfraug_ptr(1,:,:,1:n6) 1485 wfraug(1,:,:,(jjbandpp)*n6+1:(jjbandpp+1)*n6)=wfraug_ptr(2,:,:,1:n6) 1486 else if (bandpp>1) then 1487 wfraug(1,:,:,(iibandpp-1)*n6+1:iibandpp*n6)=wfraug_ptr(1,:,:,1:n6) 1488 end if 1489 if (nproc_fft>1) then 1490 if (me_fft>0) then 1491 nd3=(ngfft(3)-1)/nproc_fft+1 1492 wfraug(1,:,:,me_fft*nd3+1:me_fft*nd3+nd3)=wfraug(1,:,:,1:nd3) 1493 wfraug(1,:,:,1:nd3)=zero 1494 end if 1495 call xmpi_sum(wfraug,mpi_enreg%comm_fft,ier) 1496 end if 1497 end if 1498 end do 1499 if (option_fourwf==0.and.bandpp>1) then 1500 ABI_FREE(wfraug_ptr) 1501 end if 1502 end if ! (gpu_option/=ABI_GPU_DISABLED) 1503 1504 ! ------------------------------------------------------------ 1505 ! We dissociate each wave function in two waves functions 1506 ! gwavef is classed below of bandpp 1507 ! ------------------------------------------------------------ 1508 call prep_wavef_sym_undo(mpi_enreg,bandpp,1,& 1509 & ndatarecv,& 1510 & ndatarecv_tot,ndatasend_sym,idatarecv0,& 1511 & cwavef_alltoall1,& 1512 & sendcounts_sym,sdispls_sym,& 1513 & recvcounts_sym,rdispls_sym,& 1514 & ewavef_alltoall_sym,& 1515 & index_wavef_send) 1516 1517 ABI_FREE(ewavef_alltoall_sym) 1518 ABI_FREE(index_wavef_send) 1519 1520 ! ------------------------------------------------------- 1521 ! Sorting waves functions below the processors 1522 ! ------------------------------------------------------- 1523 ! cwavef_alltoall(:,index_wavef_band) = cwavef_alltoall(:,:) ! NOT NEEDED 1524 1525 ABI_FREE(index_wavef_band) 1526 1527 end if 1528 1529 !==================================================================== 1530 if(have_to_reequilibrate) then 1531 ABI_FREE(buff_wf) 1532 ABI_FREE(cwavef_fft) 1533 if(bandpp > 1) then 1534 ABI_FREE(cwavef_fft_tr) 1535 end if 1536 end if 1537 ABI_FREE(sendcountsloc) 1538 ABI_FREE(sdisplsloc) 1539 ABI_FREE(recvcountsloc) 1540 ABI_FREE(rdisplsloc) 1541 ABI_FREE(cwavef_alltoall2) 1542 if ( ((.not.flag_inv_sym) .and. (bandpp>1) ) .or. flag_inv_sym ) then 1543 if(gpu_option_==ABI_GPU_KOKKOS) then 1544 #if defined HAVE_GPU && defined HAVE_YAKL 1545 ABI_FREE_MANAGED(cwavef_alltoall1) 1546 #endif 1547 else 1548 ABI_FREE(cwavef_alltoall1) 1549 end if 1550 end if 1551 1552 end subroutine prep_fourwf
ABINIT/prep_getghc [ Functions ]
NAME
prep_getghc
FUNCTION
this routine prepares the data to the call of getghc.
INPUTS
blocksize= size of block for FFT cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects) cwavef(2,npw*my_nspinor*blocksize)=planewave coefficients of wavefunction. gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k gvnlxc=matrix elements <G|Vnonlocal+VFockACE|C> lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1 Typically lambda is the eigenvalue (or its guess) mpi_enreg=information about mpi parallelization prtvol=control print volume and debugging output sij_opt= -PAW ONLY- if 0, only matrix elements <G|H|C> have to be computed (S=overlap) if 1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used)
OUTPUT
gwavef=(2,npw*my_nspinor*blocksize)=matrix elements <G|H|C> (if sij_opt>=0) or <G|H-lambda.S|C> (if sij_opt=-1). swavef=(2,npw*my_nspinor*blocksize)=matrix elements <G|S|C>.
SIDE EFFECTS
====== if gs_hamk%usepaw==1 cwaveprj(natom,my_nspinor*bandpp)= wave functions at k projected with nl projectors
SOURCE
105 subroutine prep_getghc(cwavef, gs_hamk, gvnlxc, gwavef, swavef, lambda, blocksize, & 106 mpi_enreg, prtvol, sij_opt, cpopt, cwaveprj, & 107 already_transposed) ! optional argument 108 109 !Arguments ------------------------------------ 110 !scalars 111 integer,intent(in) :: blocksize,cpopt,prtvol,sij_opt 112 logical, intent(in),optional :: already_transposed 113 real(dp),intent(in) :: lambda 114 type(gs_hamiltonian_type),intent(inout) :: gs_hamk 115 type(mpi_type),intent(in) :: mpi_enreg 116 !arrays 117 real(dp),intent(in) :: cwavef(:,:) 118 real(dp),intent(inout) :: gvnlxc (:,:),gwavef(:,:),swavef(:,:) 119 type(pawcprj_type), intent(inout) :: cwaveprj(:,:) 120 121 !Local variables------------------------------- 122 !scalars 123 integer,parameter :: tim_getghc=6 124 integer :: bandpp,bandpp_sym,idatarecv0,ier,ikpt_this_proc,iscalc,mcg,my_nspinor 125 integer :: nbval,ndatarecv,ndatarecv_tot,ndatasend_sym,nproc_band,nproc_fft 126 integer :: spaceComm 127 logical :: flag_inv_sym, do_transpose, local_gvnlxc 128 !character(len=500) :: msg 129 !arrays 130 integer,allocatable :: index_wavef_band(:),index_wavef_send(:),index_wavef_spband(:) 131 integer,allocatable :: rdisplsloc(:),recvcountsloc(:),sdisplsloc(:),sendcountsloc(:) 132 integer,ABI_CONTIGUOUS pointer :: kg_k_gather_sym(:,:) 133 integer,ABI_CONTIGUOUS pointer :: rdispls(:),rdispls_sym(:) 134 integer,ABI_CONTIGUOUS pointer :: recvcounts(:),recvcounts_sym(:),recvcounts_sym_tot(:) 135 integer,ABI_CONTIGUOUS pointer :: sdispls(:),sdispls_sym(:) 136 integer,ABI_CONTIGUOUS pointer :: sendcounts(:),sendcounts_sym(:),sendcounts_sym_all(:) 137 integer,ABI_CONTIGUOUS pointer :: tab_proc(:) 138 real(dp) :: tsec(2) 139 real(dp),allocatable,target :: cwavef_alltoall1(:,:), gvnlxc_alltoall1(:,:) 140 real(dp),allocatable,target :: gwavef_alltoall1(:,:), swavef_alltoall1(:,:) 141 142 #if defined HAVE_GPU && defined HAVE_YAKL 143 real(c_double), ABI_CONTIGUOUS pointer :: cwavef_alltoall2(:,:) => null() 144 real(c_double), ABI_CONTIGUOUS pointer :: gvnlxc_alltoall2(:,:) => null() 145 real(c_double), ABI_CONTIGUOUS pointer :: gwavef_alltoall2(:,:) => null() 146 real(c_double), ABI_CONTIGUOUS pointer :: swavef_alltoall2(:,:) => null() 147 #else 148 real(dp),allocatable,target :: cwavef_alltoall2(:,:) 149 real(dp),allocatable,target :: gvnlxc_alltoall2(:,:) 150 real(dp),allocatable,target :: gwavef_alltoall2(:,:) 151 real(dp),allocatable,target :: swavef_alltoall2(:,:) 152 #endif 153 154 real(dp),pointer :: ewavef_alltoall_sym(:,:) 155 real(dp),pointer :: gvnlxc_alltoall_sym(:,:) 156 real(dp),pointer :: gwavef_alltoall_sym(:,:) 157 real(dp),pointer :: swavef_alltoall_sym(:,:) 158 159 ! ************************************************************************* 160 161 call timab(630,1,tsec) 162 call timab(631,3,tsec) 163 164 !Some inits 165 nproc_band = mpi_enreg%nproc_band 166 nproc_fft = mpi_enreg%nproc_fft 167 bandpp = mpi_enreg%bandpp 168 my_nspinor = max(1,gs_hamk%nspinor/mpi_enreg%nproc_spinor) 169 170 do_transpose = .true. 171 if(present(already_transposed)) then 172 if(already_transposed) do_transpose = .false. 173 end if 174 175 flag_inv_sym = (gs_hamk%istwf_k==2 .and. any(gs_hamk%ngfft(7) == [401,402,312,512])) 176 if (flag_inv_sym) then 177 gs_hamk%istwf_k = 1 178 if (modulo(bandpp,2)==0) bandpp_sym = bandpp/2 179 if (modulo(bandpp,2)/=0) bandpp_sym = bandpp 180 end if 181 182 !Check sizes 183 mcg=2*gs_hamk%npw_fft_k*my_nspinor*bandpp 184 if (do_transpose) mcg=2*gs_hamk%npw_k*my_nspinor*blocksize 185 if (size(cwavef)<mcg) then 186 ABI_BUG('wrong size for cwavef!') 187 end if 188 if (size(gwavef)<mcg) then 189 ABI_BUG('wrong size for gwavef!') 190 end if 191 local_gvnlxc = .false. 192 if (size(gvnlxc)==0) then 193 local_gvnlxc = .true. 194 end if 195 if ((.not.local_gvnlxc).and.size(gvnlxc)<mcg) then 196 ABI_BUG('wrong size for gvnlxc!') 197 end if 198 if (sij_opt==1) then 199 if (size(swavef)<mcg) then 200 ABI_BUG('wrong size for swavef!') 201 end if 202 end if 203 if (gs_hamk%usepaw==1.and.cpopt>=0) then 204 if (size(cwaveprj)<gs_hamk%natom*my_nspinor*bandpp) then 205 ABI_BUG('wrong size for cwaveprj!') 206 end if 207 end if 208 209 !==================================================================================== 210 211 spaceComm=mpi_enreg%comm_fft 212 if(mpi_enreg%paral_kgb==1) spaceComm=mpi_enreg%comm_band 213 214 ikpt_this_proc=bandfft_kpt_get_ikpt() 215 216 ABI_MALLOC(sendcountsloc,(nproc_band)) 217 ABI_MALLOC(sdisplsloc ,(nproc_band)) 218 ABI_MALLOC(recvcountsloc,(nproc_band)) 219 ABI_MALLOC(rdisplsloc ,(nproc_band)) 220 221 recvcounts =>bandfft_kpt(ikpt_this_proc)%recvcounts(:) 222 sendcounts =>bandfft_kpt(ikpt_this_proc)%sendcounts(:) 223 rdispls =>bandfft_kpt(ikpt_this_proc)%rdispls (:) 224 sdispls =>bandfft_kpt(ikpt_this_proc)%sdispls (:) 225 ndatarecv = bandfft_kpt(ikpt_this_proc)%ndatarecv 226 227 if (flag_inv_sym ) then 228 idatarecv0 = bandfft_kpt(ikpt_this_proc)%idatarecv0 229 ndatarecv_tot = bandfft_kpt(ikpt_this_proc)%ndatarecv_tot 230 ndatasend_sym = bandfft_kpt(ikpt_this_proc)%ndatasend_sym 231 kg_k_gather_sym =>bandfft_kpt(ikpt_this_proc)%kg_k_gather_sym(:,:) 232 rdispls_sym =>bandfft_kpt(ikpt_this_proc)%rdispls_sym(:) 233 recvcounts_sym =>bandfft_kpt(ikpt_this_proc)%recvcounts_sym(:) 234 recvcounts_sym_tot =>bandfft_kpt(ikpt_this_proc)%recvcounts_sym_tot(:) 235 sdispls_sym =>bandfft_kpt(ikpt_this_proc)%sdispls_sym(:) 236 sendcounts_sym =>bandfft_kpt(ikpt_this_proc)%sendcounts_sym(:) 237 sendcounts_sym_all =>bandfft_kpt(ikpt_this_proc)%sendcounts_sym_all(:) 238 tab_proc =>bandfft_kpt(ikpt_this_proc)%tab_proc(:) 239 end if 240 iscalc=(sij_opt+1)/2 ! 0 if S not calculated, 1 otherwise 241 nbval=(ndatarecv*my_nspinor*bandpp)*iscalc 242 243 if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. & 244 & ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then 245 ABI_MALLOC(cwavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 246 ABI_MALLOC(gwavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 247 ABI_MALLOC(swavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 248 if (local_gvnlxc) then 249 ABI_MALLOC(gvnlxc_alltoall1,(0,0)) 250 else 251 ABI_MALLOC(gvnlxc_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 252 end if 253 swavef_alltoall1(:,:)=zero 254 cwavef_alltoall1(:,:)=zero 255 gwavef_alltoall1(:,:)=zero 256 if (.not.local_gvnlxc) gvnlxc_alltoall1(:,:)=zero 257 end if 258 259 if(gs_hamk%gpu_option==ABI_GPU_KOKKOS) then 260 #if defined HAVE_GPU && defined HAVE_YAKL 261 ABI_MALLOC_MANAGED(cwavef_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/)) 262 ABI_MALLOC_MANAGED(gwavef_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/)) 263 ABI_MALLOC_MANAGED(swavef_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/)) 264 if (local_gvnlxc) then 265 ABI_MALLOC_MANAGED(gvnlxc_alltoall2,(/0,0/)) 266 else 267 ABI_MALLOC_MANAGED(gvnlxc_alltoall2,(/2,ndatarecv*my_nspinor*bandpp/)) 268 end if 269 #endif 270 else 271 ABI_MALLOC(cwavef_alltoall2,(2,ndatarecv*my_nspinor*bandpp)) 272 ABI_MALLOC(gwavef_alltoall2,(2,ndatarecv*my_nspinor*bandpp)) 273 ABI_MALLOC(swavef_alltoall2,(2,ndatarecv*my_nspinor*bandpp)) 274 if (local_gvnlxc) then 275 ABI_MALLOC(gvnlxc_alltoall2,(0,0)) 276 else 277 ABI_MALLOC(gvnlxc_alltoall2,(2,ndatarecv*my_nspinor*bandpp)) 278 end if 279 end if 280 281 swavef_alltoall2(:,:)=zero 282 cwavef_alltoall2(:,:)=zero 283 gwavef_alltoall2(:,:)=zero 284 if (.not.local_gvnlxc) gvnlxc_alltoall2(:,:)=zero 285 286 recvcountsloc(:)=recvcounts(:)*2*my_nspinor*bandpp 287 rdisplsloc(:)=rdispls(:)*2*my_nspinor*bandpp 288 sendcountsloc(:)=sendcounts(:)*2*my_nspinor 289 sdisplsloc(:)=sdispls(:)*2*my_nspinor 290 call timab(631,2,tsec) 291 292 if(do_transpose) then 293 call timab(545,3,tsec) 294 if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. & 295 & ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then 296 call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall1,& 297 & recvcountsloc,rdisplsloc,spaceComm,ier) 298 else 299 call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,& 300 & recvcountsloc,rdisplsloc,spaceComm,ier) 301 end if 302 call timab(545,2,tsec) 303 else 304 ! Here, we cheat, and use DCOPY to bypass some compiler's overzealous bound-checking 305 ! (ndatarecv*my_nspinor*bandpp might be greater than the declared size of cwavef) 306 call DCOPY(2*ndatarecv*my_nspinor*bandpp, cwavef, 1, cwavef_alltoall2, 1) 307 end if 308 309 !==================================================================== 310 if ((.not.(flag_inv_sym)) .and. (bandpp==1)) then 311 if (do_transpose .and. mpi_enreg%paral_spinor==0.and.my_nspinor==2)then 312 call timab(632,3,tsec) 313 ! Sort to have all ispinor=1 first, then all ispinor=2 314 call prep_sort_wavef_spin(nproc_band,my_nspinor,ndatarecv,recvcounts,rdispls,index_wavef_spband) 315 cwavef_alltoall2(:,:)=cwavef_alltoall1(:,index_wavef_spband) 316 call timab(632,2,tsec) 317 end if 318 319 call timab(635,3,tsec) 320 call multithreaded_getghc(cpopt,cwavef_alltoall2,cwaveprj,gwavef_alltoall2,swavef_alltoall2(:,1:nbval),& 321 & gs_hamk,gvnlxc_alltoall2,lambda,mpi_enreg,1,prtvol,sij_opt,tim_getghc,0) 322 call timab(635,2,tsec) 323 324 if (do_transpose .and. mpi_enreg%paral_spinor==0.and.my_nspinor==2)then 325 call timab(634,3,tsec) 326 gwavef_alltoall1(:,index_wavef_spband)=gwavef_alltoall2(:,:) 327 if (sij_opt==1) swavef_alltoall1(:,index_wavef_spband)=swavef_alltoall2(:,:) 328 if (.not.local_gvnlxc) gvnlxc_alltoall1(:,index_wavef_spband)=gvnlxc_alltoall2(:,:) 329 ABI_FREE(index_wavef_spband) 330 call timab(634,2,tsec) 331 end if 332 333 else if ((.not.(flag_inv_sym)) .and. (bandpp>1)) then 334 ! ------------------------------------------------------------- 335 ! Computation of the index to class the waves functions below bandpp 336 ! ------------------------------------------------------------- 337 338 if(do_transpose) then 339 call timab(632,3,tsec) 340 call prep_index_wavef_bandpp(nproc_band,bandpp,& 341 & my_nspinor,ndatarecv, recvcounts,rdispls, index_wavef_band) 342 ! ------------------------------------------------------- 343 ! Sorting of the waves functions below bandpp 344 ! ------------------------------------------------------- 345 cwavef_alltoall2(:,:) = cwavef_alltoall1(:,index_wavef_band) 346 call timab(632,2,tsec) 347 end if 348 349 ! ---------------------- 350 ! Fourier transformation 351 ! ---------------------- 352 call timab(636,3,tsec) 353 call multithreaded_getghc(cpopt,cwavef_alltoall2,cwaveprj,gwavef_alltoall2,swavef_alltoall2,gs_hamk,& 354 & gvnlxc_alltoall2,lambda,mpi_enreg,bandpp,prtvol,sij_opt,tim_getghc,0) 355 call timab(636,2,tsec) 356 357 ! ----------------------------------------------------- 358 ! Sorting of waves functions below the processors 359 ! ----------------------------------------------------- 360 if(do_transpose) then 361 call timab(634,3,tsec) 362 gwavef_alltoall1(:,index_wavef_band) = gwavef_alltoall2(:,:) 363 if (sij_opt==1) swavef_alltoall1(:,index_wavef_band) = swavef_alltoall2(:,:) 364 if (.not.local_gvnlxc) gvnlxc_alltoall1(:,index_wavef_band) = gvnlxc_alltoall2(:,:) 365 ABI_FREE(index_wavef_band) 366 call timab(634,2,tsec) 367 end if 368 369 370 else if (flag_inv_sym) then 371 372 ! ------------------------------------------------------------- 373 ! Computation of the index to class the waves functions below bandpp 374 ! ------------------------------------------------------------- 375 if(do_transpose) then 376 call timab(632,3,tsec) 377 call prep_index_wavef_bandpp(nproc_band,bandpp,& 378 & my_nspinor,ndatarecv,& 379 & recvcounts,rdispls,& 380 & index_wavef_band) 381 382 ! ------------------------------------------------------- 383 ! Sorting the wave functions below bandpp 384 ! ------------------------------------------------------- 385 cwavef_alltoall2(:,:) = cwavef_alltoall1(:,index_wavef_band) 386 end if 387 388 ! ------------------------------------------------------------ 389 ! We associate the waves functions by two 390 ! ------------------------------------------------------------ 391 call prep_wavef_sym_do(mpi_enreg,bandpp,my_nspinor,& 392 & ndatarecv,& 393 & ndatarecv_tot,ndatasend_sym,tab_proc,& 394 & cwavef_alltoall2,& 395 & sendcounts_sym,sdispls_sym,& 396 & recvcounts_sym,rdispls_sym,& 397 & ewavef_alltoall_sym,& 398 & index_wavef_send) 399 400 ! ------------------------------------------------------------ 401 ! Allocation 402 ! ------------------------------------------------------------ 403 ABI_MALLOC(gwavef_alltoall_sym,(2,ndatarecv_tot*bandpp_sym)) 404 ABI_MALLOC(swavef_alltoall_sym,(2,(ndatarecv_tot*bandpp_sym)*iscalc)) 405 if (local_gvnlxc) then 406 ABI_MALLOC(gvnlxc_alltoall_sym ,(0,0)) 407 else 408 ABI_MALLOC(gvnlxc_alltoall_sym ,(2,ndatarecv_tot*bandpp_sym)) 409 end if 410 gwavef_alltoall_sym(:,:)=zero 411 swavef_alltoall_sym(:,:)=zero 412 if (.not.local_gvnlxc) gvnlxc_alltoall_sym(:,:)=zero 413 414 call timab(632,2,tsec) 415 416 ! ------------------------------------------------------------ 417 ! Fourier calculation 418 ! ------------------------------------------------------------ 419 call timab(637,3,tsec) 420 call multithreaded_getghc(cpopt,ewavef_alltoall_sym,cwaveprj,gwavef_alltoall_sym,swavef_alltoall_sym,gs_hamk,& 421 & gvnlxc_alltoall_sym,lambda,mpi_enreg,bandpp_sym,prtvol,sij_opt,tim_getghc,1,& 422 & kg_fft_k=kg_k_gather_sym) 423 call timab(637,2,tsec) 424 425 call timab(633,3,tsec) 426 427 ! ------------------------------------------------------------ 428 ! We dissociate each wave function in two waves functions 429 ! gwavef is classed below of bandpp 430 ! ------------------------------------------------------------ 431 call prep_wavef_sym_undo(mpi_enreg,bandpp,my_nspinor,& 432 & ndatarecv,& 433 & ndatarecv_tot,ndatasend_sym,idatarecv0,& 434 & gwavef_alltoall2,& 435 & sendcounts_sym,sdispls_sym,& 436 & recvcounts_sym,rdispls_sym,& 437 & gwavef_alltoall_sym,& 438 & index_wavef_send) 439 if (sij_opt==1)then 440 call prep_wavef_sym_undo(mpi_enreg,bandpp,my_nspinor,& 441 & ndatarecv,& 442 & ndatarecv_tot,ndatasend_sym,idatarecv0,& 443 & swavef_alltoall2,& 444 & sendcounts_sym,sdispls_sym,& 445 & recvcounts_sym,rdispls_sym,& 446 & swavef_alltoall_sym,& 447 & index_wavef_send) 448 end if 449 if (.not.local_gvnlxc) call prep_wavef_sym_undo(mpi_enreg,bandpp,my_nspinor,& 450 & ndatarecv,& 451 & ndatarecv_tot,ndatasend_sym,idatarecv0,& 452 & gvnlxc_alltoall2,& 453 & sendcounts_sym,sdispls_sym,& 454 & recvcounts_sym,rdispls_sym,& 455 & gvnlxc_alltoall_sym,& 456 & index_wavef_send) 457 458 ABI_FREE(ewavef_alltoall_sym) 459 ABI_FREE(index_wavef_send) 460 ABI_FREE(gwavef_alltoall_sym) 461 ABI_FREE(swavef_alltoall_sym) 462 ABI_FREE(gvnlxc_alltoall_sym) 463 464 ! ------------------------------------------- 465 ! We call getghc to calculate the nl matrix elements. 466 ! -------------------------------------------- 467 gs_hamk%istwf_k=2 468 !!write(std_out,*)"Setting iswfk_k to 2" 469 470 call timab(633,2,tsec) 471 472 call timab(638,3,tsec) 473 call multithreaded_getghc(cpopt,cwavef_alltoall2,cwaveprj,gwavef_alltoall2,swavef_alltoall2,gs_hamk,& 474 & gvnlxc_alltoall2,lambda,mpi_enreg,bandpp,prtvol,sij_opt,tim_getghc,2) 475 call timab(638,2,tsec) 476 477 call timab(634,3,tsec) 478 479 gs_hamk%istwf_k=1 480 481 ! ------------------------------------------------------- 482 ! Sorting the wave functions below the processors 483 ! ------------------------------------------------------- 484 if(do_transpose) then 485 ! cwavef_alltoall(:,index_wavef_band) = cwavef_alltoall(:,:) ! NOT NEEDED 486 gwavef_alltoall1(:,index_wavef_band) = gwavef_alltoall2(:,:) 487 if (sij_opt==1) swavef_alltoall1(:,index_wavef_band) = swavef_alltoall2(:,:) 488 if (.not.local_gvnlxc) gvnlxc_alltoall1(:,index_wavef_band) = gvnlxc_alltoall2(:,:) 489 ABI_FREE(index_wavef_band) 490 call timab(634,2,tsec) 491 end if 492 493 end if 494 !==================================================================== 495 496 if(do_transpose) then 497 498 call timab(545,3,tsec) 499 if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. & 500 & ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then 501 if (sij_opt==1) then 502 call xmpi_alltoallv(swavef_alltoall1,recvcountsloc,rdisplsloc,swavef,& 503 & sendcountsloc,sdisplsloc,spaceComm,ier) 504 end if 505 if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall1,recvcountsloc,rdisplsloc,gvnlxc,& 506 & sendcountsloc,sdisplsloc,spaceComm,ier) 507 call xmpi_alltoallv(gwavef_alltoall1,recvcountsloc,rdisplsloc,gwavef,& 508 & sendcountsloc,sdisplsloc,spaceComm,ier) 509 else 510 if (sij_opt==1) then 511 call xmpi_alltoallv(swavef_alltoall2,recvcountsloc,rdisplsloc,swavef,& 512 & sendcountsloc,sdisplsloc,spaceComm,ier) 513 end if 514 if (.not.local_gvnlxc) call xmpi_alltoallv(gvnlxc_alltoall2,recvcountsloc,rdisplsloc,gvnlxc,& 515 & sendcountsloc,sdisplsloc,spaceComm,ier) 516 call xmpi_alltoallv(gwavef_alltoall2,recvcountsloc,rdisplsloc,gwavef,& 517 & sendcountsloc,sdisplsloc,spaceComm,ier) 518 end if 519 520 call timab(545,2,tsec) 521 else 522 if(sij_opt == 1) then 523 call DCOPY(2*ndatarecv*my_nspinor*bandpp, swavef_alltoall2, 1, swavef, 1) 524 end if 525 if (.not.local_gvnlxc) call DCOPY(2*ndatarecv*my_nspinor*bandpp, gvnlxc_alltoall2, 1, gvnlxc, 1) 526 call DCOPY(2*ndatarecv*my_nspinor*bandpp, gwavef_alltoall2, 1, gwavef, 1) 527 end if 528 529 !==================================================================== 530 if (flag_inv_sym) then 531 gs_hamk%istwf_k = 2 532 end if 533 !==================================================================== 534 ABI_FREE(sendcountsloc) 535 ABI_FREE(sdisplsloc) 536 ABI_FREE(recvcountsloc) 537 ABI_FREE(rdisplsloc) 538 539 if(gs_hamk%gpu_option==ABI_GPU_KOKKOS) then 540 #if defined HAVE_GPU && defined HAVE_YAKL 541 ABI_FREE_MANAGED(cwavef_alltoall2) 542 ABI_FREE_MANAGED(gwavef_alltoall2) 543 ABI_FREE_MANAGED(gvnlxc_alltoall2) 544 ABI_FREE_MANAGED(swavef_alltoall2) 545 #endif 546 else 547 ABI_FREE(cwavef_alltoall2) 548 ABI_FREE(gwavef_alltoall2) 549 ABI_FREE(gvnlxc_alltoall2) 550 ABI_FREE(swavef_alltoall2) 551 end if 552 553 if ( ((.not.flag_inv_sym) .and. bandpp==1 .and. mpi_enreg%paral_spinor==0 .and. my_nspinor==2 ).or. & 554 & ((.not.flag_inv_sym) .and. bandpp>1) .or. flag_inv_sym ) then 555 ABI_FREE(cwavef_alltoall1) 556 ABI_FREE(gwavef_alltoall1) 557 ABI_FREE(gvnlxc_alltoall1) 558 ABI_FREE(swavef_alltoall1) 559 end if 560 561 call timab(630,2,tsec) 562 563 end subroutine prep_getghc
ABINIT/prep_index_wavef_bandpp [ Functions ]
NAME
prep_index_wavef_bandpp
FUNCTION
this routine sorts the waves functions by bandpp and by processors after the alltoall
INPUTS
nproc_band = number of processors below the band bandpp = number of groups of couple of waves functions nspinor = number of spin ndatarecv = total number of values received by the processor and sended by the other processors band recvcounts = number of values sended by each processor band and received by the processor rdispls = positions of the values received by the processor and sended by each processor band
OUTPUT
index_wavef_band = position of the sorted values
SOURCE
2078 subroutine prep_index_wavef_bandpp(nproc_band,bandpp,& 2079 nspinor,ndatarecv,& 2080 recvcounts,rdispls,& 2081 index_wavef_band) 2082 2083 !Arguments ------------------------------------ 2084 !scalars 2085 integer,intent(in) :: bandpp,ndatarecv,nproc_band,nspinor 2086 !arrays 2087 integer,intent(in) :: rdispls(nproc_band),recvcounts(nproc_band) 2088 integer,allocatable,intent(out) :: index_wavef_band(:) 2089 2090 !Local variables------------------------------- 2091 !scalars 2092 integer :: delta,idebc,idebe,ifinc,ifine,iindex,iproc,kbandpp,nb 2093 2094 ! ********************************************************************* 2095 2096 !DEBUG 2097 !write(std_out,*)' prep_index_wavef_banpp : enter ' 2098 !write(std_out,*) 'ndatarecv = ', ndatarecv 2099 !write(std_out,*) 'rdispls(:) = ', rdispls(:) 2100 !write(std_out,*) 'recvcounts(:) = ', recvcounts(:) 2101 !ENDDEBUG 2102 2103 2104 !--------------------------------------------- 2105 !Allocation 2106 !--------------------------------------------- 2107 ABI_MALLOC(index_wavef_band ,(bandpp*nspinor*ndatarecv)) 2108 index_wavef_band(:) =0 2109 2110 !--------------------------------------------- 2111 !Calcul : loops on bandpp and processors band 2112 !--------------------------------------------- 2113 nb = sum(recvcounts(1:nproc_band)) 2114 do kbandpp=1,bandpp 2115 2116 do iproc=1,nproc_band 2117 2118 idebe = (rdispls(iproc) + 1) + (kbandpp-1) * ndatarecv*nspinor 2119 ifine = idebe + recvcounts(iproc) -1 2120 2121 if (iproc==1) then 2122 idebc = (kbandpp-1)* recvcounts(iproc)*nspinor + 1 2123 else 2124 idebc = (bandpp) * sum(recvcounts(1:iproc-1))*nspinor & 2125 + (kbandpp-1)* recvcounts(iproc)*nspinor & 2126 + 1 2127 end if 2128 ifinc = idebc + recvcounts(iproc) -1 2129 index_wavef_band(idebe:ifine) = (/( iindex,iindex=idebc,ifinc)/) 2130 delta=ifine-idebe 2131 if (nspinor==2) then 2132 index_wavef_band(idebe+nb :idebe+nb +delta)=(/( iindex,iindex=ifinc+1,ifinc+1+delta)/) 2133 end if 2134 end do 2135 end do 2136 2137 end subroutine prep_index_wavef_bandpp
abinit/prep_nonlop [ Functions ]
[ Top ] [ abinit ] [ Functions ]
NAME
prep_nonlop
FUNCTION
this routine prepares the data to the call of nonlop.
INPUTS
choice: chooses possible output: choice=1 => a non-local energy contribution =2 => a gradient with respect to atomic position(s) =3 => a gradient with respect to strain(s) =23=> a gradient with respect to atm. pos. and strain(s) =4 => a 2nd derivative with respect to atomic pos. =24=> a gradient and 2nd derivative with respect to atomic pos. =5 => a gradient with respect to k wavevector =6 => 2nd derivatives with respect to strain and atm. pos. =7 => no operator, just projections blocksize= size of block for FFT cpopt=flag defining the status of cwaveprj=<Proj_i|Cnk> scalars (see below, side effects) cwavef(2,npw*my_nspinor*blocksize)=planewave coefficients of wavefunction. gvnlxc=matrix elements <G|Vnonlocal+VFockACE|C> hamk <type(gs_hamiltonian_type)>=data defining the Hamiltonian at a given k (NL part involved here) idir=direction of the - atom to be moved in the case (choice=2,signs=2), - k point direction in the case (choice=5,signs=2) - strain component (1:6) in the case (choice=2,signs=2) or (choice=6,signs=1) lambdablock(blocksize)=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2 mpi_enreg=information about mpi parallelization nnlout=dimension of enlout (when signs=1): ntypat=number of types of atoms in cell paw_opt= define the nonlocal operator concerned with signs= if 1, get contracted elements (energy, forces, stress, ...) if 2, applies the non-local operator to a function in reciprocal space tim_nonlop=timing code of the calling routine (can be set to 0 if not attributed) vectproj(2,nprojs,my_nspinor*ndat)=Optional, vector to be used instead of cprjin%cp when provided
OUTPUT
==== if (signs==1) ==== enlout_block(nnlout)= if paw_opt==0, 1 or 2: contribution of this block of states to the nl part of various properties if paw_opt==3: contribution of this block of states to <c|S|c> (where S=overlap when PAW) ==== if (signs==2) ==== if paw_opt==0, 1, 2 or 4: gvnlc(2,my_nspinor*npw)=result of the application of the nl operator or one of its derivative to the input vect. if paw_opt==3 or 4: gsc(2,my_nspinor*npw*(paw_opt/3))=result of the aplication of (I+S) to the input vect. (where S=overlap when PAW)
SIDE EFFECTS
==== ONLY IF useylm=1 cwaveprj(natom,my_nspinor) <type(pawcprj_type)>=projected input wave function |c> on non-local projector =<p_lmn|c> and derivatives Treatment depends on cpopt parameter: if cpopt=-1, <p_lmn|in> (and derivatives) have to be computed (and not saved) if cpopt= 0, <p_lmn|in> have to be computed and saved derivatives are eventually computed but not saved if cpopt= 1, <p_lmn|in> and first derivatives have to be computed and saved other derivatives are eventually computed but not saved if cpopt= 2 <p_lmn|in> are already in memory; only derivatives are computed here and not saved (if useylm=0, should have cpopt=-1)
NOTES
cprj (as well as cg) is distributed over band processors. Only the mod((iband-1)/mpi_enreg%bandpp,mpi_enreg%nproc_band) projected WFs are stored on each proc.
SOURCE
636 subroutine prep_nonlop(choice,cpopt,cwaveprj,enlout_block,hamk,idir,lambdablock,& 637 blocksize,mpi_enreg,nnlout,paw_opt,signs,gsc, tim_nonlop,cwavef,gvnlc, & 638 already_transposed,gpu_option,vectproj) ! optional 639 640 !Arguments ------------------------------------ 641 integer, intent(in) :: blocksize,choice,cpopt,idir,signs,nnlout,paw_opt 642 logical,optional,intent(in) :: already_transposed 643 integer,optional,intent(in) :: gpu_option 644 real(dp), intent(in) :: lambdablock(blocksize) 645 real(dp), intent(out) :: enlout_block(nnlout*blocksize),gvnlc(:,:),gsc(:,:) 646 real(dp), intent(inout) :: cwavef(:,:) 647 real(dp),ABI_CONTIGUOUS optional,intent(inout) :: vectproj(:,:,:) 648 type(gs_hamiltonian_type),intent(in) :: hamk 649 type(mpi_type),intent(in) :: mpi_enreg 650 type(pawcprj_type),intent(inout) :: cwaveprj(:,:) 651 652 !Local variables------------------------------- 653 !scalars 654 integer :: bandpp,ier,ikpt_this_proc,my_nspinor,ndatarecv,nproc_band,npw,nspinortot 655 integer :: spaceComm=0,tim_nonlop 656 logical :: do_transpose 657 integer :: l_gpu_option 658 !character(len=500) :: msg 659 !arrays 660 integer, allocatable :: index_wavef_band(:) 661 integer, allocatable :: rdisplsloc(:),recvcountsloc(:),sdisplsloc(:),sendcountsloc(:) 662 integer,ABI_CONTIGUOUS pointer :: rdispls(:),recvcounts(:),sdispls(:),sendcounts(:) 663 real(dp) :: lambda_nonlop(mpi_enreg%bandpp) 664 real(dp) :: tsec(2) 665 666 #if defined HAVE_GPU && defined HAVE_YAKL 667 real(c_double), ABI_CONTIGUOUS pointer :: cwavef_alltoall2(:,:) => null() 668 real(c_double), ABI_CONTIGUOUS pointer :: gvnlc_alltoall2(:,:) => null() 669 real(c_double), ABI_CONTIGUOUS pointer :: gsc_alltoall2(:,:) => null() 670 integer(kind=C_SIZE_T) :: cwavef_alltoall2_size 671 integer(kind=C_SIZE_T) :: gvnlc_alltoall2_size 672 integer(kind=C_SIZE_T) :: gsc_alltoall2_size 673 #else 674 real(dp), allocatable :: cwavef_alltoall2(:,:) 675 real(dp), allocatable :: gvnlc_alltoall2(:,:) 676 real(dp), allocatable :: gsc_alltoall2(:,:) 677 #endif 678 679 real(dp), allocatable :: cwavef_alltoall1(:,:) 680 real(dp), allocatable :: gvnlc_alltoall1(:,:) 681 real(dp), allocatable :: gsc_alltoall1(:,:) 682 real(dp), allocatable :: enlout(:) 683 684 #if defined HAVE_GPU && defined HAVE_YAKL 685 ! this buffer is necessary to avoid mixing "managed memory" buffer with "regular memory" buffer in MPI calls 686 ! Just to be clear: 687 ! - managed memory means memory allocated using ABI_MALLOC_MANAGED 688 ! - regular memory means memory allocated using either ABI_MALLOC or ABI_MALLOC_CUDA 689 ! 690 ! here we chose to use a GPU buffer, would it be better to use a CPU buffer ? To be checked. 691 !real(dp), allocatable :: cwavef_mpi(:,:) 692 type(c_ptr) :: cwavef_mpi_c 693 real(c_double),pointer :: cwavef_mpi(:,:) 694 #endif 695 696 ! ************************************************************************* 697 698 DBG_ENTER('COLL') 699 700 call timab(570,1,tsec) 701 702 do_transpose = .true. 703 if(present(already_transposed)) then 704 if(already_transposed) do_transpose = .false. 705 end if 706 707 l_gpu_option=ABI_GPU_DISABLED 708 if (present(gpu_option)) then 709 l_gpu_option = gpu_option 710 end if 711 712 nproc_band = mpi_enreg%nproc_band 713 bandpp = mpi_enreg%bandpp 714 spaceComm=mpi_enreg%comm_fft 715 if(mpi_enreg%paral_kgb==1) spaceComm=mpi_enreg%comm_band 716 my_nspinor=max(1,hamk%nspinor/mpi_enreg%nproc_spinor) 717 nspinortot=hamk%nspinor 718 719 !Check sizes 720 npw=hamk%npw_k;if (.not.do_transpose) npw=hamk%npw_fft_k 721 if (size(cwavef)/=2*npw*my_nspinor*blocksize) then 722 ABI_BUG('Incorrect size for cwavef!') 723 end if 724 if(choice/=0.and.signs==2) then 725 if (paw_opt/=3) then 726 if (size(gvnlc)/=2*npw*my_nspinor*blocksize) then 727 ABI_BUG('Incorrect size for gvnlc!') 728 end if 729 end if 730 if(paw_opt>=3) then 731 if (size(gsc)/=2*npw*my_nspinor*blocksize) then 732 ABI_BUG('Incorrect size for gsc!') 733 end if 734 end if 735 end if 736 if(cpopt>=0.and. .not. present(vectproj)) then 737 if (size(cwaveprj)/=hamk%natom*my_nspinor*mpi_enreg%bandpp) then 738 ABI_BUG('Incorrect size for cwaveprj!') 739 end if 740 end if 741 742 ABI_MALLOC(sendcountsloc,(nproc_band)) 743 ABI_MALLOC(sdisplsloc ,(nproc_band)) 744 ABI_MALLOC(recvcountsloc,(nproc_band)) 745 ABI_MALLOC(rdisplsloc ,(nproc_band)) 746 747 ikpt_this_proc=bandfft_kpt_get_ikpt() 748 749 recvcounts => bandfft_kpt(ikpt_this_proc)%recvcounts(:) 750 sendcounts => bandfft_kpt(ikpt_this_proc)%sendcounts(:) 751 rdispls => bandfft_kpt(ikpt_this_proc)%rdispls (:) 752 sdispls => bandfft_kpt(ikpt_this_proc)%sdispls (:) 753 ndatarecv = bandfft_kpt(ikpt_this_proc)%ndatarecv 754 755 if(hamk%gpu_option==ABI_GPU_KOKKOS) then 756 #if defined HAVE_GPU && defined HAVE_YAKL 757 ABI_MALLOC_MANAGED(cwavef_alltoall2, (/2,ndatarecv*my_nspinor*bandpp/)) 758 cwavef_alltoall2_size = 2*ndatarecv*my_nspinor*bandpp*dp 759 760 if (paw_opt >= 0 .and. paw_opt < 3) then 761 gsc_alltoall2 => null() 762 gsc_alltoall2_size = 0 763 else 764 ABI_MALLOC_MANAGED(gsc_alltoall2, (/2,ndatarecv*my_nspinor*(paw_opt/3)*bandpp/)) 765 gsc_alltoall2_size = 2*ndatarecv*my_nspinor*(paw_opt/3)*bandpp*dp 766 endif 767 768 ABI_MALLOC_MANAGED(gvnlc_alltoall2, (/2,ndatarecv*my_nspinor*bandpp/)) 769 gvnlc_alltoall2_size = 2*ndatarecv*my_nspinor*bandpp*dp 770 #endif 771 else 772 ABI_MALLOC(cwavef_alltoall2, (2,ndatarecv*my_nspinor*bandpp)) 773 ABI_MALLOC(gsc_alltoall2, (2,ndatarecv*my_nspinor*(paw_opt/3)*bandpp)) 774 ABI_MALLOC(gvnlc_alltoall2, (2,ndatarecv*my_nspinor*bandpp)) 775 end if 776 777 if(do_transpose .and. (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)))then 778 ABI_MALLOC(cwavef_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 779 if(signs==2)then 780 if (paw_opt/=3) then 781 ABI_MALLOC(gvnlc_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 782 end if 783 if (paw_opt==3.or.paw_opt==4) then 784 ABI_MALLOC(gsc_alltoall1,(2,ndatarecv*my_nspinor*bandpp)) 785 end if 786 end if 787 end if 788 789 ABI_MALLOC(enlout,(nnlout*bandpp)) 790 enlout = zero 791 792 recvcountsloc(:)=recvcounts(:)*2*my_nspinor*bandpp 793 rdisplsloc(:)=rdispls(:)*2*my_nspinor*bandpp 794 sendcountsloc(:)=sendcounts(:)*2*my_nspinor 795 sdisplsloc(:)=sdispls(:)*2*my_nspinor 796 797 if(do_transpose) then 798 call timab(581,1,tsec) 799 if (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)) then 800 if (l_gpu_option == ABI_GPU_KOKKOS) then 801 #if defined HAVE_GPU && defined HAVE_YAKL 802 ABI_MALLOC_CUDA(cwavef_mpi_c, INT(2, c_size_t) * npw * my_nspinor * blocksize * dp) 803 call c_f_pointer(cwavef_mpi_c, cwavef_mpi, (/2, npw * my_nspinor * blocksize/)) 804 805 ! use cwavef_mpi instead of cwavef (don't use managed memory in MPI calls) 806 call copy_gpu_to_gpu(cwavef_mpi_c, C_LOC(cwavef), INT(2, c_size_t) * npw * my_nspinor * blocksize * dp) 807 808 call xmpi_alltoallv(cwavef_mpi,sendcountsloc,sdisplsloc,cwavef_alltoall1,& 809 & recvcountsloc,rdisplsloc,spaceComm,ier) 810 811 ABI_FREE_CUDA(cwavef_mpi_c) 812 #endif 813 else 814 call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall1,& 815 & recvcountsloc,rdisplsloc,spaceComm,ier) 816 end if 817 818 819 else 820 call xmpi_alltoallv(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall2,& 821 & recvcountsloc,rdisplsloc,spaceComm,ier) 822 end if 823 call timab(581,2,tsec) 824 else 825 ! Here, we cheat, and use DCOPY to bypass some compiler's overzealous bound-checking 826 ! (ndatarecv*my_nspinor*bandpp might be greater than the declared size of cwavef) 827 if (l_gpu_option == ABI_GPU_KOKKOS) then 828 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) 829 call copy_gpu_to_gpu(C_LOC(cwavef_alltoall2), C_LOC(cwavef), INT(2, c_size_t) * ndatarecv * my_nspinor * bandpp * dp) 830 #endif 831 else 832 call DCOPY(2*ndatarecv*my_nspinor*bandpp,cwavef,1,cwavef_alltoall2,1) 833 end if 834 end if 835 836 !===================================================================== 837 if (bandpp==1) then 838 839 840 if (do_transpose .and. mpi_enreg%paral_spinor==0.and.nspinortot==2) then !Sort WF by spin 841 call prep_sort_wavef_spin(nproc_band,my_nspinor,ndatarecv,recvcounts,rdispls,index_wavef_band) 842 cwavef_alltoall2(:, :) = cwavef_alltoall1(:,index_wavef_band) 843 end if 844 845 if (paw_opt==2) then 846 lambda_nonlop(1)=lambdablock(mpi_enreg%me_band+1) 847 end if 848 call nonlop(choice,cpopt,cwaveprj,enlout,hamk,idir,lambda_nonlop,mpi_enreg,1,nnlout,paw_opt,& 849 & signs,gsc_alltoall2,tim_nonlop,cwavef_alltoall2,gvnlc_alltoall2,vectproj=vectproj) 850 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) 851 !call gpu_device_synchronize() 852 !call gpu_data_prefetch_async_f(C_LOC(cwavef_alltoall2), cwavef_alltoall2_size, CPU_DEVICE_ID) 853 !call gpu_data_prefetch_async_f(C_LOC(gvnlc_alltoall2), gvnlc_alltoall2_size, CPU_DEVICE_ID) 854 !call gpu_data_prefetch_async_f(C_LOC(gsc_alltoall2), gsc_alltoall2_size, CPU_DEVICE_ID) 855 #endif 856 857 if (do_transpose .and. mpi_enreg%paral_spinor == 0 .and. nspinortot==2.and.signs==2) then 858 if (paw_opt/=3) gvnlc_alltoall1(:,index_wavef_band)=gvnlc_alltoall2(:,:) 859 if (paw_opt==3.or.paw_opt==4) gsc_alltoall1(:,index_wavef_band)=gsc_alltoall2(:,:) 860 end if 861 862 else ! bandpp/=1 863 864 ! ------------------------------------------------------------- 865 ! Computation of the index used to sort the waves functions below bandpp 866 ! ------------------------------------------------------------- 867 if(do_transpose) then 868 call prep_index_wavef_bandpp(nproc_band,bandpp,& 869 & my_nspinor,ndatarecv,recvcounts,rdispls,index_wavef_band) 870 871 ! ------------------------------------------------------- 872 ! Sorting of the waves functions below bandpp 873 ! ------------------------------------------------------- 874 cwavef_alltoall2(:,:) = cwavef_alltoall1(:,index_wavef_band) 875 end if 876 877 ! ------------------------------------------------------- 878 ! Call nonlop 879 ! ------------------------------------------------------- 880 if(paw_opt == 2) then 881 lambda_nonlop(1:bandpp) = lambdablock((mpi_enreg%me_band*bandpp)+1:((mpi_enreg%me_band+1)*bandpp)) 882 end if 883 call nonlop(choice,cpopt,cwaveprj,enlout,hamk,idir,lambda_nonlop,mpi_enreg,bandpp,nnlout,paw_opt,& 884 & signs,gsc_alltoall2,tim_nonlop,cwavef_alltoall2,gvnlc_alltoall2,vectproj=vectproj) 885 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) 886 if(hamk%gpu_option==ABI_GPU_KOKKOS) call gpu_device_synchronize() 887 !call gpu_data_prefetch_async_f(C_LOC(cwavef_alltoall2), cwavef_alltoall2_size, CPU_DEVICE_ID) 888 !call gpu_data_prefetch_async_f(C_LOC(gvnlc_alltoall2), gvnlc_alltoall2_size, CPU_DEVICE_ID) 889 !if (associated(gsc_alltoall2)) then 890 ! call gpu_data_prefetch_async_f(C_LOC(gsc_alltoall2), gsc_alltoall2_size, CPU_DEVICE_ID) 891 !end if 892 #endif 893 894 ! ----------------------------------------------------- 895 ! Sorting of waves functions below the processors 896 ! ----------------------------------------------------- 897 if(do_transpose.and.signs==2) then 898 if (paw_opt/=3) gvnlc_alltoall1(:,index_wavef_band)=gvnlc_alltoall2(:,:) 899 if (paw_opt==3.or.paw_opt==4) gsc_alltoall1(:,index_wavef_band)=gsc_alltoall2(:,:) 900 end if 901 902 end if 903 904 !===================================================================== 905 ! ------------------------------------------------------- 906 ! Deallocation 907 ! ------------------------------------------------------- 908 if (allocated(index_wavef_band)) then 909 ABI_FREE(index_wavef_band) 910 end if 911 912 !Transpose the gsc_alltoall or gvlnc_alltoall tabs 913 !according to the paw_opt and signs values 914 if(do_transpose) then 915 if (signs==2) then 916 call timab(581,1,tsec) 917 if(bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2))then 918 if (paw_opt/=3) then 919 call xmpi_alltoallv(gvnlc_alltoall1,recvcountsloc,rdisplsloc,gvnlc,& 920 & sendcountsloc,sdisplsloc,spaceComm,ier) 921 end if 922 if (paw_opt==3.or.paw_opt==4) then 923 call xmpi_alltoallv(gsc_alltoall1,recvcountsloc,rdisplsloc,gsc,& 924 & sendcountsloc,sdisplsloc,spaceComm,ier) 925 end if 926 else 927 if (paw_opt/=3) then 928 call xmpi_alltoallv(gvnlc_alltoall2,recvcountsloc,rdisplsloc,gvnlc,& 929 & sendcountsloc,sdisplsloc,spaceComm,ier) 930 end if 931 if (paw_opt==3.or.paw_opt==4) then 932 call xmpi_alltoallv(gsc_alltoall2,recvcountsloc,rdisplsloc,gsc,& 933 & sendcountsloc,sdisplsloc,spaceComm,ier) 934 end if 935 end if 936 call timab(581,2,tsec) 937 end if 938 else 939 ! TODO check other usages, maybe 940 if (l_gpu_option == ABI_GPU_KOKKOS) then 941 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) 942 call copy_gpu_to_gpu(C_LOC(gsc), C_LOC(gsc_alltoall2), INT(2, c_size_t) * ndatarecv * my_nspinor * bandpp * dp) 943 #endif 944 else 945 call DCOPY(2*ndatarecv*my_nspinor*bandpp, gsc_alltoall2, 1, gsc, 1) 946 end if 947 end if 948 949 if (nnlout>0) then 950 call xmpi_allgather(enlout,nnlout*bandpp,enlout_block,spaceComm,ier) 951 end if 952 ABI_FREE(enlout) 953 ABI_FREE(sendcountsloc) 954 ABI_FREE(sdisplsloc) 955 ABI_FREE(recvcountsloc) 956 ABI_FREE(rdisplsloc) 957 958 if(hamk%gpu_option==ABI_GPU_KOKKOS) then 959 #if defined HAVE_GPU && defined HAVE_YAKL 960 ABI_FREE_MANAGED(cwavef_alltoall2) 961 ABI_FREE_MANAGED(gvnlc_alltoall2) 962 if (paw_opt >= 3) then 963 ABI_FREE_MANAGED(gsc_alltoall2) 964 end if 965 #endif 966 else 967 ABI_FREE(cwavef_alltoall2) 968 ABI_FREE(gvnlc_alltoall2) 969 ABI_FREE(gsc_alltoall2) 970 end if 971 972 if(do_transpose .and. (bandpp/=1 .or. (bandpp==1 .and. mpi_enreg%paral_spinor==0.and.nspinortot==2)))then 973 ABI_FREE(cwavef_alltoall1) 974 if(signs==2)then 975 if (paw_opt/=3) then 976 ABI_FREE(gvnlc_alltoall1) 977 end if 978 if (paw_opt==3.or.paw_opt==4) then 979 ABI_FREE(gsc_alltoall1) 980 end if 981 end if 982 end if 983 984 call timab(570,2,tsec) 985 986 DBG_EXIT('COLL') 987 988 end subroutine prep_nonlop
ABINIT/prep_sort_wavef_spin [ Functions ]
NAME
prep_sort_wavef_spin
FUNCTION
Compute index used to sort a spinorial wave-function by spin Sort to have all nspinor=1 fisrt, then all nspinor=2
INPUTS
nproc_band=size of "band" communicator nspinor=number of spinorial components of the wavefunction ndatarecv=total number of values on all processors recvcounts(nproc_band)= number of received values by the processor rdispls(nproc_band)= offsets of the received values by the processor
OUTPUT
index_wavef(:)=array containing the sorted indexes (pointer, allocated in this routine)
SOURCE
2160 subroutine prep_sort_wavef_spin(nproc_band,nspinor,ndatarecv,recvcounts,rdispls,index_wavef) 2161 2162 !Arguments ------------------------------------ 2163 !scalars 2164 integer,intent(in) :: ndatarecv,nproc_band,nspinor 2165 !arrays 2166 integer,intent(in) :: rdispls(nproc_band),recvcounts(nproc_band) 2167 integer,allocatable,intent(out) :: index_wavef(:) 2168 2169 !Local variables------------------------------- 2170 !scalars 2171 integer :: isft,isft1,iproc,iindex 2172 !arrays 2173 integer,allocatable :: recvcountsloc(:),rdisplsloc(:) 2174 2175 ! ********************************************************************* 2176 2177 ABI_MALLOC(index_wavef,(ndatarecv*nspinor)) 2178 2179 ABI_MALLOC(recvcountsloc,(nproc_band)) 2180 ABI_MALLOC(rdisplsloc,(nproc_band)) 2181 recvcountsloc(:)=recvcounts(:)*2*nspinor 2182 rdisplsloc(:)=rdispls(:)*2*nspinor 2183 2184 !--------------------------------------------- 2185 !Loops on bandpp and processors band 2186 !--------------------------------------------- 2187 isft=0 2188 do iproc=1,nproc_band 2189 2190 ! ===== Spin up 2191 if (iproc==1) then 2192 isft= 0 2193 else 2194 isft= sum(recvcounts(1: (iproc-1))) 2195 end if 2196 isft1 = 0.5*rdisplsloc(iproc) 2197 2198 index_wavef(1+isft:isft+recvcounts(iproc))= & 2199 & (/(iindex,iindex=isft1+1,isft1+recvcounts(iproc))/) 2200 2201 ! =====Spin down 2202 if (iproc==1)then 2203 isft=sum(recvcounts(1:nproc_band)) 2204 else 2205 isft=sum(recvcounts(1:nproc_band)) & 2206 & +sum(recvcounts(1:iproc-1)) 2207 end if 2208 isft1 = 0.5 * rdisplsloc(iproc) + recvcounts(iproc) 2209 2210 index_wavef(1+isft:isft+recvcounts(iproc))= & 2211 & (/(iindex,iindex=isft1+1,isft1+ recvcounts(iproc))/) 2212 2213 end do 2214 2215 ABI_FREE(recvcountsloc) 2216 ABI_FREE(rdisplsloc) 2217 2218 end subroutine prep_sort_wavef_spin
ABINIT/prep_wavef_sym_do [ Functions ]
NAME
prep_wavef_sym_do
FUNCTION
this routine associates waves functions by two as following E(G) = C(G) + D(G) E(-G) = C*(G) + iD*(G) the values are distributed on the processors in function of the value of mpi_enreg%distribfft%tab_fftwf2_distrib( (-kg_k_gather(2,i) )
INPUTS
mpi_enreg = information about mpi parallelization bandpp = number of couple of waves functions nspinor = number of spin ndatarecv = number of values received by the processor and sended by the other processors band ndatarecv_tot = total number of received values (ndatarecv + number of received opposited planewave coordinates) ndatasend_sym = number of sended values to the processors fft to create opposited planewave coordinates tab_proc = positions of opposited planewave coordinates in the list of the processors fft cwavef_alltoall = planewave coefficients of wavefunction ( initial of the processor + sended by other processors band) sendcounts_sym = number of sended values by the processor to each processor fft sdispls_sym = postions of the sended values by the processor to each processor fft recvcounts_sym = number of the received values by the processor from each processor fft rdispls_sym = postions of the received values by the processor from each processor fft
OUTPUT
ewavef_alltoall_sym = planewave coefficients of wavefunction initial of the processor + sended by other processors band + sended by other processors fft + and compisited if bandpp >1 index_wavef_send = index to send the values in blocks to the other processor fft
SIDE EFFECTS
SOURCE
1598 subroutine prep_wavef_sym_do(mpi_enreg,bandpp,nspinor,& 1599 & ndatarecv,& 1600 & ndatarecv_tot,ndatasend_sym,tab_proc,& 1601 & cwavef_alltoall,& 1602 & sendcounts_sym,sdispls_sym,& 1603 & recvcounts_sym,rdispls_sym,& 1604 & ewavef_alltoall_sym,& 1605 & index_wavef_send) 1606 1607 !Arguments ------------------------------------ 1608 !scalars 1609 integer,intent(in) :: bandpp,ndatarecv,ndatarecv_tot,ndatasend_sym 1610 integer,intent(in) :: nspinor 1611 type(mpi_type),intent(in) :: mpi_enreg 1612 !arrays 1613 integer,allocatable,intent(out) :: index_wavef_send(:) 1614 integer,intent(in) :: rdispls_sym(:),recvcounts_sym(:) 1615 integer,intent(in) :: sdispls_sym(:),sendcounts_sym(:) 1616 integer,intent(in) :: tab_proc(:) 1617 real(dp),intent(inout) :: cwavef_alltoall(2,ndatarecv*nspinor*bandpp) 1618 real(dp),pointer :: ewavef_alltoall_sym(:,:) 1619 1620 !Local variables------------------------------- 1621 !scalars 1622 integer :: bandpp_sym,ibandpp,idatarecv,ideb_loc,idebc,idebd,idebe 1623 integer :: ier,ifin_loc,ifinc,ifind,ifine,iproc,jbandpp,jsendloc 1624 integer :: kbandpp,newspacecomm,nproc_fft 1625 logical :: flag_compose 1626 !arrays 1627 integer,allocatable :: rdispls_sym_loc(:),recvcounts_sym_loc(:) 1628 integer,allocatable :: sdispls_sym_loc(:),sendcounts_sym_loc(:) 1629 real(dp),allocatable :: ewavef_alltoall_loc(:,:),ewavef_alltoall_send(:,:) 1630 1631 ! ********************************************************************* 1632 1633 !DEBUG 1634 !write(std_out,*)' prep_wavef_sym_do : enter ' 1635 !ENDDEBUG 1636 1637 !--------------------------------------------- 1638 !Initialisation 1639 !--------------------------------------------- 1640 nproc_fft = mpi_enreg%nproc_fft 1641 1642 newspacecomm = mpi_enreg%comm_fft 1643 1644 if (modulo(bandpp,2)==0) then 1645 bandpp_sym = bandpp/2 1646 flag_compose = .TRUE. 1647 else 1648 bandpp_sym = bandpp 1649 flag_compose = .FALSE. 1650 end if 1651 1652 !--------------------------------------------- 1653 !Allocation 1654 !--------------------------------------------- 1655 ABI_MALLOC(ewavef_alltoall_sym ,(2,ndatarecv_tot*bandpp_sym)) 1656 ABI_MALLOC(ewavef_alltoall_loc ,(2,ndatarecv *bandpp_sym)) 1657 ABI_MALLOC(ewavef_alltoall_send ,(2,ndatasend_sym*bandpp_sym)) 1658 ABI_MALLOC(index_wavef_send ,( ndatasend_sym*bandpp_sym)) 1659 1660 ABI_MALLOC(sendcounts_sym_loc ,(nproc_fft)) 1661 ABI_MALLOC(sdispls_sym_loc ,(nproc_fft)) 1662 ABI_MALLOC(recvcounts_sym_loc ,(nproc_fft)) 1663 ABI_MALLOC(rdispls_sym_loc ,(nproc_fft)) 1664 1665 1666 !Initialisation 1667 !-------------- 1668 ewavef_alltoall_sym(:,:) =0. 1669 ewavef_alltoall_loc(:,:) =0. 1670 1671 sendcounts_sym_loc(:) =0 1672 sdispls_sym_loc(:) =0 1673 recvcounts_sym_loc(:) =0 1674 rdispls_sym_loc(:) =0 1675 1676 index_wavef_send(:) =0 1677 1678 1679 !------------------------------------------------- 1680 !We are bandpp blocks which we want to : 1681 !associate by two (band_sym==bandpp/2) 1682 !or not associate by two (band_sym==bandpp) 1683 ! 1684 !So We'll have got bandpp_sym blocks 1685 !So we loop on the bandpp_sym blocks 1686 !-------------------------------------------------- 1687 1688 do kbandpp=1,bandpp_sym 1689 1690 ! position of the two blocks 1691 ! -------------------------- 1692 ibandpp = (kbandpp-1) * 2 1693 jbandpp = ibandpp + 1 1694 1695 idebe = (kbandpp-1) * ndatarecv_tot + 1 1696 ifine = idebe + ndatarecv - 1 1697 1698 idebc = ibandpp * ndatarecv + 1 1699 ifinc = idebc + ndatarecv - 1 1700 1701 idebd = jbandpp * ndatarecv + 1 1702 ifind = idebd + ndatarecv - 1 1703 1704 ideb_loc = (kbandpp-1) * ndatarecv + 1 1705 ifin_loc = ideb_loc + ndatarecv - 1 1706 1707 1708 if (flag_compose) then 1709 1710 1711 ! calcul ewavef(G) 1712 ! ---------------- 1713 ewavef_alltoall_sym(1,idebe:ifine) = & 1714 & cwavef_alltoall(1,idebc:ifinc) & 1715 & - cwavef_alltoall(2,idebd:ifind) 1716 1717 ewavef_alltoall_sym(2,idebe:ifine) = & 1718 & cwavef_alltoall(2,idebc:ifinc) & 1719 & + cwavef_alltoall(1,idebd:ifind) 1720 1721 ! calcul ewavef_loc(-G) 1722 ! --------------------- 1723 ewavef_alltoall_loc(1,ideb_loc:ifin_loc) = & 1724 & cwavef_alltoall(1,idebc:ifinc) & 1725 & + cwavef_alltoall(2,idebd:ifind) 1726 1727 ewavef_alltoall_loc(2,ideb_loc:ifin_loc) = & 1728 & - cwavef_alltoall(2,idebc:ifinc) & 1729 & + cwavef_alltoall(1,idebd:ifind) 1730 else 1731 1732 ! calcul ewavef(G) 1733 ! ---------------- 1734 ewavef_alltoall_sym(1,idebe:ifine) = cwavef_alltoall(1,idebc:ifinc) 1735 ewavef_alltoall_sym(2,idebe:ifine) = cwavef_alltoall(2,idebc:ifinc) 1736 1737 ! calcul ewavef_loc(-G) 1738 ! --------------------- 1739 ewavef_alltoall_loc(1,ideb_loc:ifin_loc) = cwavef_alltoall(1,idebc:ifinc) 1740 ewavef_alltoall_loc(2,ideb_loc:ifin_loc) = - cwavef_alltoall(2,idebc:ifinc) 1741 1742 end if 1743 1744 end do 1745 1746 1747 1748 !------------------------------------------------------------------------ 1749 !Creation of datas blocks for each processor fft from ewavef_alltoall_loc 1750 !to send datas by blocks with a alltoall... 1751 !------------------------------------------------------------------------ 1752 1753 !Position of the blocks 1754 !---------------------- 1755 jsendloc=0 1756 do ibandpp=1,bandpp_sym 1757 do iproc=1,nproc_fft 1758 do idatarecv=1,ndatarecv 1759 if (tab_proc(idatarecv)==(iproc-1)) then 1760 jsendloc=jsendloc+1 1761 index_wavef_send(jsendloc) = idatarecv + ndatarecv * (ibandpp-1) 1762 end if 1763 end do 1764 end do 1765 end do 1766 1767 !Classment 1768 !---------- 1769 ewavef_alltoall_send(:,:)=ewavef_alltoall_loc(:,index_wavef_send) 1770 1771 1772 !------------------------------------------------- 1773 !Calcul of the number of received and sended datas 1774 !------------------------------------------------- 1775 sendcounts_sym_loc = sendcounts_sym*2 1776 recvcounts_sym_loc = recvcounts_sym*2 1777 1778 !------------------------------------------ 1779 !Exchange of the datas ewavef_allto_all_loc 1780 !------------------------------------------ 1781 do ibandpp=1,bandpp_sym 1782 1783 ! ------------------------------------------------ 1784 ! Deplacment of the sended datas because of bandpp 1785 ! ------------------------------------------------ 1786 sdispls_sym_loc(:) = sdispls_sym(:) + ndatasend_sym * (ibandpp-1) 1787 sdispls_sym_loc = sdispls_sym_loc *2 1788 1789 ! -------------------------------------------------- 1790 ! Deplacment of the received datas because of bandpp 1791 ! -------------------------------------------------- 1792 rdispls_sym_loc(:) = rdispls_sym(:) + ndatarecv_tot * (ibandpp-1) 1793 rdispls_sym_loc = rdispls_sym_loc *2 1794 1795 1796 call xmpi_alltoallv(& 1797 & ewavef_alltoall_send(:,:) ,sendcounts_sym_loc,sdispls_sym_loc,& 1798 & ewavef_alltoall_sym(:,:) ,recvcounts_sym_loc,rdispls_sym_loc,& 1799 & newspacecomm,ier) 1800 1801 end do 1802 1803 !----------------------- 1804 !Desallocation 1805 !----------------------- 1806 1807 ABI_FREE(sendcounts_sym_loc) 1808 ABI_FREE(recvcounts_sym_loc) 1809 ABI_FREE(sdispls_sym_loc) 1810 ABI_FREE(rdispls_sym_loc) 1811 1812 ABI_FREE(ewavef_alltoall_loc) 1813 ABI_FREE(ewavef_alltoall_send) 1814 1815 end subroutine prep_wavef_sym_do
ABINIT/prep_wavef_sym_undo [ Functions ]
NAME
prep_wavef_sym_undo
FUNCTION
this routine dissociates each wave function in two waves functions as following C(G) = ( E*(-G) + E(G))/2 D(G) = i*( E*(-G) - E(G))/2 the values are redistributed on the processors in function of the value of mpi_enreg%distribfft%tab_fftwf2_distrib( (-kg_k_gather(2,i) )
INPUTS
mpi_enreg = information about mpi parallelization bandpp = number of groups of couple of waves functions nspinor = number of spin ndatarecv = number of values received by the processor and sended by the other processors band ndatarecv_tot = total number of received values (ndatarecv + number of received opposited planewave coordinates) ndatasend_sym = number of sended values to the processors fft to create opposited planewave coordinates idatarecv0 = position of the planewave coordinates (0,0,0) sendcounts_sym = number of sended values by the processor to each processor fft sdispls_sym = postions of the sended values by the processor to each processor fft recvcounts_sym = number of the received values by the processor to each processor fft gwavef_alltoall_sym = planewave coefficients of wavefunction initial of the processor + sended by other processors band + sended by other processors fft + and composited if bandpp >1 index_wavef_send = index to send the values by block to the other processor fft
OUTPUT
gwavef_alltoall = planewave coefficients of wavefunction ( for of the processor + to send to other processors band)
SOURCE
1859 subroutine prep_wavef_sym_undo(mpi_enreg,bandpp,nspinor,& 1860 & ndatarecv,& 1861 & ndatarecv_tot,ndatasend_sym,idatarecv0,& 1862 & gwavef_alltoall,& 1863 & sendcounts_sym,sdispls_sym,& 1864 & recvcounts_sym,rdispls_sym,& 1865 & gwavef_alltoall_sym,& 1866 & index_wavef_send) 1867 1868 !Arguments ------------------------------------ 1869 !scalars 1870 integer,intent(in) :: bandpp,idatarecv0,ndatarecv,ndatarecv_tot,ndatasend_sym 1871 integer,intent(in) :: nspinor 1872 type(mpi_type),intent(in) :: mpi_enreg 1873 !arrays 1874 integer,intent(in) :: index_wavef_send(:),rdispls_sym(:),recvcounts_sym(:) 1875 integer,intent(in) :: sdispls_sym(:),sendcounts_sym(:) 1876 real(dp),intent(inout) :: gwavef_alltoall(2,ndatarecv*nspinor*bandpp) 1877 real(dp),intent(inout) :: gwavef_alltoall_sym(:,:) 1878 1879 !Local variables------------------------------- 1880 !scalars 1881 integer :: bandpp_sym,ibandpp,ideb_loc,idebc,idebd 1882 integer :: idebe,ier,ifin_loc,ifinc,ifind,ifine 1883 integer :: jbandpp,kbandpp,newspacecomm,nproc_fft 1884 logical :: flag_compose 1885 !arrays 1886 integer,allocatable :: rdispls_sym_loc(:),recvcounts_sym_loc(:) 1887 integer,allocatable :: sdispls_sym_loc(:),sendcounts_sym_loc(:) 1888 real(dp),allocatable :: gwavef_alltoall_loc(:,:),gwavef_alltoall_rcv(:,:) 1889 1890 ! ********************************************************************* 1891 1892 !DEBUG 1893 !write(std_out,*)' prep_wavef_sym_undo : enter ' 1894 !ENDDEBUG 1895 1896 1897 !--------------------------------------------- 1898 !Initialisation 1899 !--------------------------------------------- 1900 nproc_fft = mpi_enreg%nproc_fft 1901 1902 newspacecomm = mpi_enreg%comm_fft 1903 1904 if (modulo(bandpp,2)==0) then 1905 bandpp_sym = bandpp/2 1906 flag_compose = .TRUE. 1907 else 1908 bandpp_sym = bandpp 1909 flag_compose = .FALSE. 1910 end if 1911 1912 !--------------------------------------------- 1913 !Allocation 1914 !--------------------------------------------- 1915 ABI_MALLOC(gwavef_alltoall_loc ,(2,ndatarecv *bandpp_sym)) 1916 ABI_MALLOC(gwavef_alltoall_rcv ,(2,ndatasend_sym *bandpp_sym)) 1917 1918 ABI_MALLOC(sendcounts_sym_loc ,(nproc_fft)) 1919 ABI_MALLOC(sdispls_sym_loc ,(nproc_fft)) 1920 ABI_MALLOC(recvcounts_sym_loc ,(nproc_fft)) 1921 ABI_MALLOC(rdispls_sym_loc ,(nproc_fft)) 1922 1923 1924 !--------------------------------------------- 1925 !Initialisation 1926 !--------------------------------------------- 1927 gwavef_alltoall_loc(:,:) =0. 1928 1929 sendcounts_sym_loc(:) =0 1930 sdispls_sym_loc(:) =0 1931 recvcounts_sym_loc(:) =0 1932 rdispls_sym_loc(:) =0 1933 1934 1935 !------------------------------------------------- 1936 !Calcul of number of the sended and received datas 1937 !------------------------------------------------- 1938 sendcounts_sym_loc = sendcounts_sym*2 1939 recvcounts_sym_loc = recvcounts_sym*2 1940 1941 !---------------------------------------------------- 1942 !Sending of the values 1943 !---------------------------------------------------- 1944 do ibandpp = 1,bandpp_sym 1945 1946 ! ------------------------------------------------- 1947 ! Deplacment of the sended values because of bandpp 1948 ! ------------------------------------------------- 1949 sdispls_sym_loc(:) = sdispls_sym(:) + ndatasend_sym * (ibandpp-1) 1950 sdispls_sym_loc = sdispls_sym_loc *2 1951 1952 ! --------------------------------------------------- 1953 ! Deplacment of the received values because of bandpp 1954 ! --------------------------------------------------- 1955 rdispls_sym_loc(:) = rdispls_sym(:) + ndatarecv_tot * (ibandpp-1) 1956 rdispls_sym_loc = rdispls_sym_loc *2 1957 1958 1959 call xmpi_alltoallv(& 1960 & gwavef_alltoall_sym(:,:) ,recvcounts_sym_loc,rdispls_sym_loc,& 1961 & gwavef_alltoall_rcv(:,:) ,sendcounts_sym_loc,sdispls_sym_loc,& 1962 & newspacecomm,ier) 1963 1964 end do 1965 1966 1967 !---------------------- 1968 !Dispatching the blocks 1969 !---------------------- 1970 gwavef_alltoall_loc(:,index_wavef_send(:)) = gwavef_alltoall_rcv(:,:) 1971 1972 !---------------------- 1973 !Case -kg = [0 0 0] 1974 !---------------------- 1975 if (idatarecv0/=-1) then 1976 do kbandpp=1,bandpp_sym 1977 gwavef_alltoall_loc(:,(kbandpp-1)*ndatarecv + idatarecv0)= & 1978 gwavef_alltoall_sym(:,(kbandpp-1)*ndatarecv_tot + idatarecv0) 1979 end do 1980 end if 1981 1982 !--------------------------------------------------- 1983 !Build of hwavef_alltoall 1984 ! 1985 !We have got : 1986 !bandpp_sym blocks to dissociate 1987 !or bandpp_sym blokcs to not dissociate 1988 !-------------------------------------------------- 1989 do kbandpp=1,bandpp_sym 1990 1991 ! position of the 2 blocks 1992 ! ---------------------------------- 1993 ibandpp = (kbandpp-1) * 2 1994 jbandpp = ibandpp + 1 1995 1996 idebe = (kbandpp-1) * ndatarecv_tot + 1 1997 ifine = idebe + ndatarecv - 1 1998 1999 idebc = ibandpp * ndatarecv + 1 2000 ifinc = idebc + ndatarecv - 1 2001 2002 idebd = jbandpp * ndatarecv + 1 2003 ifind = idebd + ndatarecv - 1 2004 2005 ideb_loc = (kbandpp-1) * ndatarecv + 1 2006 ifin_loc = ideb_loc + ndatarecv - 1 2007 2008 2009 if (flag_compose) then 2010 2011 ! calcul cwavef(G) 2012 ! ---------------- 2013 gwavef_alltoall(1,idebc:ifinc) = gwavef_alltoall_sym(1,idebe:ifine) & 2014 & + gwavef_alltoall_loc(1,ideb_loc:ifin_loc) 2015 gwavef_alltoall(2,idebc:ifinc) = gwavef_alltoall_sym(2,idebe:ifine) & 2016 & - gwavef_alltoall_loc(2,ideb_loc:ifin_loc) 2017 2018 ! calcul dwavef(G) 2019 ! ------------------ 2020 gwavef_alltoall(1,idebd:ifind) = gwavef_alltoall_sym(2,idebe:ifine) & 2021 & + gwavef_alltoall_loc(2,ideb_loc:ifin_loc) 2022 gwavef_alltoall(2,idebd:ifind) = - gwavef_alltoall_sym(1,idebe:ifine) & 2023 & + gwavef_alltoall_loc(1,ideb_loc:ifin_loc) 2024 else 2025 2026 ! calcul cwavef(G) 2027 ! ---------------- 2028 gwavef_alltoall(1,idebc:ifinc) = gwavef_alltoall_sym(1,idebe:ifine) & 2029 & + gwavef_alltoall_loc(1,ideb_loc:ifin_loc) 2030 gwavef_alltoall(2,idebc:ifinc) = gwavef_alltoall_sym(2,idebe:ifine) & 2031 & - gwavef_alltoall_loc(2,ideb_loc:ifin_loc) 2032 end if 2033 2034 end do 2035 2036 !We divise by two 2037 gwavef_alltoall(:,:) = gwavef_alltoall(:,:)/2 2038 2039 !----------------------- 2040 !Desallocation 2041 !----------------------- 2042 2043 ABI_FREE(sendcounts_sym_loc) 2044 ABI_FREE(recvcounts_sym_loc) 2045 ABI_FREE(sdispls_sym_loc) 2046 ABI_FREE(rdispls_sym_loc) 2047 2048 ABI_FREE(gwavef_alltoall_loc) 2049 ABI_FREE(gwavef_alltoall_rcv) 2050 2051 end subroutine prep_wavef_sym_undo