TABLE OF CONTENTS
- ABINIT/m_hidecudarec
- m_hidecudarec/CleanRecGPU
- m_hidecudarec/cudarec
- m_hidecudarec/InitRecGPU
- m_hidecudarec/InitRecGPU_0
- m_initcuda/prt_mem_usage
ABINIT/m_hidecudarec [ Modules ]
NAME
m_hidecudarec
FUNCTION
Call the C-cu program to make recursion on GPU
COPYRIGHT
Copyright (C) 2009-2024 ABINIT group (MMancini) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
NOTES
SOURCE
18 #if defined HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 #if defined HAVE_GPU_CUDA 22 #include "cuda_common.h" 23 #endif 24 25 #include "abi_common.h" 26 27 28 module m_hidecudarec 29 30 use defs_basis 31 use defs_rectypes 32 use m_abicore 33 use m_fft, only : fourdp 34 35 #if defined HAVE_GPU_CUDA 36 use m_gpu_toolbox 37 #endif 38 39 implicit none 40 41 private 42 43 #if defined HAVE_GPU_CUDA 44 private :: prt_mem_usage ! Print memory usage 45 #endif 46 47 #if defined HAVE_GPU_CUDA 48 public :: InitRecGPU_0 ! Initialize recGPU_type 49 public :: InitRecGPU ! InitRecGPU 50 public :: cudarec ! Make the recursion on GPU 51 #endif 52 public :: CleanRecGPU ! deallocate all pointers. 53 54 55 CONTAINS !===========================================================
m_hidecudarec/CleanRecGPU [ Functions ]
[ Top ] [ m_hidecudarec ] [ Functions ]
NAME
CleanRecGPU
FUNCTION
If there are devices availeble than the recGPU_type is initialized
INPUTS
load=marks allocation of some arrays recgpu<type(devGPU_type)>=contains information of GPU
OUTPUT
nptrec(ndevice)=number of points for recursion on GPU
SOURCE
353 subroutine CleanRecGPU(recgpu,load) 354 355 implicit none 356 357 !Arguments ------------------------------------ 358 integer,intent(in) :: load 359 type(recGPU_type),intent(inout) :: recgpu 360 ! ************************************************************************* 361 362 recgpu%nptrec = 0 363 364 if(associated(recgpu%map)) then 365 ABI_FREE(recgpu%map) 366 end if 367 if(load==1)then 368 if(allocated(recgpu%par%displs)) then 369 ABI_FREE(recgpu%par%displs) 370 end if 371 if(allocated(recgpu%par%vcount)) then 372 ABI_FREE(recgpu%par%vcount) 373 end if 374 endif 375 call unset_dev() 376 377 end subroutine CleanRecGPU
m_hidecudarec/cudarec [ Functions ]
[ Top ] [ m_hidecudarec ] [ Functions ]
NAME
cudarec
FUNCTION
Make recursion on a GPU device
INPUTS
OUTPUT
SOURCE
259 #if defined HAVE_GPU_CUDA 260 261 subroutine cudarec(rset,exppot,an,bn2,beta,trotter,tolrec,gratio,ngfft,max_rec) 262 263 implicit none 264 265 !Arguments ------------------------------------ 266 integer,intent(in) :: trotter,gratio 267 real(dp),intent(in) :: beta,tolrec 268 integer,intent(inout) :: max_rec 269 type(recursion_type),intent(inout) :: rset 270 integer,intent(in) :: ngfft(1:3) 271 real(dp), intent(in) :: exppot(0:product(ngfft)-1) 272 real(cudap), intent(inout) :: an(0:rset%GPU%par%npt-1,0:rset%min_nrec) 273 real(cudap), intent(inout) :: bn2(0:rset%GPU%par%npt-1,0:rset%min_nrec) 274 !Local variables------------------------------- 275 ! character(len=500) :: msg 276 !integer :: maxpt,ipt,ii,jj,kk 277 real(dp) :: T_p(0:rset%nfftrec-1) 278 ! **integer*********************************************************************** 279 280 281 !DEBUG 282 ! write (std_out,*) ' m_hidecudarec/cudarec : enter' 283 !ENDDEBUG 284 285 call fourdp(1,rset%ZT_p,T_p,1,rset%mpi,rset%nfftrec,1,rset%ngfftrec,0) 286 T_p = (one/rset%nfftrec)*T_p 287 288 if(.not.(rset%tronc)) then 289 call cuda_rec_cal(trotter,& 290 & gratio,& 291 & rset%GPU%par%npt,& 292 & rset%min_nrec,& 293 & rset%GPU%nptrec,& 294 & max_rec,& 295 & real(beta,cudap),& 296 & real(rset%efermi,cudap),& 297 & real(tolrec,cudap),& 298 & real(rset%inf%ucvol,cudap),& 299 & rset%GPU%par%pt0,& 300 & rset%GPU%par%pt1,& 301 & rset%ngfftrec(1:3),& 302 & real(T_p,cudap),& 303 & real(exppot,cudap),& 304 & an,bn2) 305 306 else 307 308 309 call cuda_rec_cal_cut(trotter,& 310 & gratio,& 311 & rset%GPU%par%npt,& 312 & rset%min_nrec,& 313 & rset%GPU%nptrec,& 314 & max_rec,& 315 & real(beta,cudap),& 316 & real(rset%efermi,cudap),& 317 & real(tolrec,cudap),& 318 & real(rset%inf%ucvol,cudap),& 319 & rset%GPU%par%pt0,& 320 & rset%GPU%par%pt1,& 321 & ngfft,& 322 & rset%ngfftrec(1:3),& 323 & real(T_p,cudap),& 324 & real(exppot,cudap),& 325 & an,bn2) 326 327 endif 328 329 !DEBUG 330 !write (std_out,*) ' m_hidecudarec/cudarec : exit' 331 !ENDDEBUG 332 333 end subroutine cudarec
m_hidecudarec/InitRecGPU [ Functions ]
[ Top ] [ m_hidecudarec ] [ Functions ]
NAME
InitRecGPU
FUNCTION
If there are devices available then the recGPU_type is initialized
INPUTS
rset<recusion_type>= contains information of recusion gpuinfo<devGPU_type>=contains information of GPU calc_type=if 0 takes the possible max for nptrec (to test the completly full graphic card). 1 after test to calculate the min possible value for nptrec
OUTPUT
recgpuinfo<recGPU_type>=contains information of recursion with GPU
SOURCE
166 #if defined HAVE_GPU_CUDA 167 168 subroutine InitRecGPU(rset,nfft,gratio,gpudevice,calc_type) 169 170 implicit none 171 172 !Arguments ------------------------------------ 173 integer,intent(in) :: nfft,gpudevice 174 integer,intent(in) :: gratio 175 integer,intent(in) :: calc_type 176 type(recursion_type),intent(inout) :: rset 177 !Local variables------------------------------- 178 integer :: pos_size,resto,nfftc 179 ! real(dp) :: 180 type(devGPU_type) :: gpuinfo 181 character(len=500) :: msg 182 ! ************************************************************************* 183 nfftc = nfft/(gratio**3) 184 pos_size = 1 185 rset%gpudevice = gpudevice 186 187 188 call InitGPU(gpuinfo,gpudevice) 189 !-- look if it is possible to set devices CUDA compatible 190 call set_dev(gpudevice) 191 if(gpudevice>-1)then 192 !--Take the approximate use of memory to compute the number of points on any GPU 193 if(rset%tronc)then 194 !for CUDA version <3.0 : 195 ! pos_size = (.90d0*real(gpuinfo%maxmemdev(0))/real(cudap)& 196 ! & -real(nfft+4*rset%nfftrec))/real((4*rset%nfftrec+15+2)) 197 ! for CUDA version 3.0 with batched FFT: 198 pos_size = (.50d0*real(gpuinfo%maxmemdev(0))/real(cudap)& 199 & -real(nfft+2*rset%nfftrec))/real((6*rset%nfftrec+15+2)) 200 201 202 else 203 !for CUDA version <3.0 : 204 ! pos_size = (.90d0*real(gpuinfo%maxmemdev(0))/real(cudap)& 205 ! & -real(5*rset%nfftrec))/real((3*rset%nfftrec+15)+2) 206 ! for CUDA version 3.0 with batched FFT: 207 pos_size = (.5d0*real(gpuinfo%maxmemdev(0))/real(cudap)-real(3& 208 &*rset%nfftrec))/real((5*rset%nfftrec)+15+2) 209 210 endif 211 !--The nbr of points has to be bigger than 1 and smaller than 212 ! rset%par%npt (which is the number of points given to any proc to compute 213 ! it is smaller than nfftrec) 214 pos_size = min(pos_size,nfftc) 215 216 !--if production and not timing test 217 if(calc_type==1) pos_size = min(pos_size,rset%GPU%par%npt) 218 219 if(pos_size<1) then 220 write(msg,'(a)')' ERROR NO SUFFICENT MEMORY ON DEVICE' 221 call wrtout(std_out,msg,'PERS') 222 end if 223 224 225 !--For GPU calculation it is better to have a number of point 226 ! proportional to the half-warp size (16) 227 228 if(pos_size >16 )then 229 resto = mod(pos_size,16) 230 if(resto /=0) then 231 pos_size = pos_size-resto 232 if(pos_size<nfftc) pos_size = pos_size+16 233 endif 234 endif 235 236 rset%GPU%nptrec = pos_size 237 if(rset%mpi%me==0) then 238 call prt_mem_usage(pos_size,rset%nfftrec) 239 end if 240 endif 241 call CleanGPU(gpuinfo) 242 243 end subroutine InitRecGPU
m_hidecudarec/InitRecGPU_0 [ Functions ]
[ Top ] [ m_hidecudarec ] [ Functions ]
NAME
InitRecGPU_0
FUNCTION
recGPU_type is initialized
INPUTS
mpi_ab=MPI information
OUTPUT
recgpu=initialisation of GPU variables for recursion
SOURCE
127 #if defined HAVE_GPU_CUDA 128 129 subroutine InitRecGPU_0(recgpu,mpi_ab) 130 131 implicit none 132 133 !Arguments ------------------------------------ 134 type(MPI_type),intent(in) :: mpi_ab 135 type(recGPU_type),intent(inout) :: recgpu 136 !Local variables------------------------------- 137 ! ************************************************************************* 138 recgpu%nptrec = 0 139 nullify(recgpu%map) 140 ABI_MALLOC(recgpu%map,(0:mpi_ab%nproc-1)) 141 recgpu%map = -1 !--Initial guess no gpu 142 143 end subroutine InitRecGPU_0
m_initcuda/prt_mem_usage [ Functions ]
[ Top ] [ m_initcuda ] [ Functions ]
NAME
prt_mem_usage
FUNCTION
Print information about allocation on GPU device during recursion
INPUTS
nptrec=number of vectors allocated on device nfft=size of the grid (and so of a vector)
SOURCE
70 #if defined HAVE_GPU_CUDA 71 subroutine prt_mem_usage(nptrec,nfft) 72 73 implicit none 74 !Arguments ------------------------------------ 75 integer,intent(in) :: nptrec,nfft 76 !Local --------------------------- 77 integer :: ii 78 integer(kind=i4b) :: largeur,clargeur 79 real(dp):: totmem,rpart 80 character(500) :: msg 81 ! ********************************************************************* 82 83 largeur = cudap*nfft 84 clargeur = cudap*nfft*2 85 !for CUDA version <3.0 : 86 ! rpart = 3.d0*real(largeur,dp)/1048576.d0*real(nptrec,dp) 87 ! totmem = rpart+real(2*clargeur+largeur+(2*cudap+i2b)*nptrec,dp)/1048576.d0 88 !for CUDA version 3.0 : 89 rpart = 6.d0*real(largeur,dp)/1048576.d0*real(nptrec,dp) 90 totmem = rpart+real(clargeur+largeur+(2*cudap+i2b)*nptrec,dp)/1048576.d0 91 92 write(msg,'(a,80a)')' ',('_',ii=1,80) 93 call wrtout(std_out,msg,'COLL') 94 write(msg,'(a18,a44,a18,a)')'_________________',& 95 & ' Allocated Memory on Device for Recursion ','___________________' ,ch10 96 call wrtout(std_out,msg,'COLL') 97 98 write (msg,'(2(a32,i10,a),2(a32,i10,a6,a),2(a32,f10.2,a7,a))')& 99 & ' Number of Points ',nfft ,ch10, & 100 & ' Number of Vectors ',nptrec,ch10, & 101 & ' Size Real Vectors ',largeur ,'bytes',ch10, & 102 & ' Size Complex Vectors ',clargeur,'bytes',ch10, & 103 & ' Size Matrix of Vectors ',real(largeur*nptrec,dp)/1048576.d0,'Mbytes',ch10, & 104 & ' Allocated Memory on GPU ',totmem,'Mbytes',ch10 105 call wrtout(std_out,msg,'COLL') 106 write(msg,'(a,80a)')' ',('_',ii=1,80) 107 call wrtout(std_out,msg,'COLL') 108 end subroutine prt_mem_usage 109 110 #endif