TABLE OF CONTENTS


ABINIT/m_hidecudarec [ Modules ]

[ Top ] [ Modules ]

NAME

 m_hidecudarec

FUNCTION

  Call the C-cu program to make recursion on GPU

COPYRIGHT

  Copyright (C) 2009-2024 ABINIT group (MMancini)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

NOTES

SOURCE

18 #if defined HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 #if defined HAVE_GPU_CUDA
22 #include "cuda_common.h"
23 #endif
24 
25 #include "abi_common.h"
26 
27 
28 module m_hidecudarec
29 
30  use defs_basis
31  use defs_rectypes
32  use m_abicore
33  use m_fft,        only : fourdp
34 
35 #if defined HAVE_GPU_CUDA
36  use m_gpu_toolbox
37 #endif
38 
39  implicit none
40 
41  private
42 
43 #if defined HAVE_GPU_CUDA
44  private ::  prt_mem_usage          ! Print memory usage
45 #endif
46 
47 #if defined HAVE_GPU_CUDA
48  public ::  InitRecGPU_0        ! Initialize recGPU_type
49  public ::  InitRecGPU          ! InitRecGPU
50  public ::  cudarec             ! Make the recursion on GPU
51 #endif
52  public :: CleanRecGPU            ! deallocate all pointers.
53 
54 
55 CONTAINS !===========================================================

m_hidecudarec/CleanRecGPU [ Functions ]

[ Top ] [ m_hidecudarec ] [ Functions ]

NAME

 CleanRecGPU

FUNCTION

  If there are devices availeble than the recGPU_type is initialized

INPUTS

  load=marks allocation of some arrays
  recgpu<type(devGPU_type)>=contains information of GPU

OUTPUT

 nptrec(ndevice)=number of points for recursion on GPU

SOURCE

353 subroutine CleanRecGPU(recgpu,load)
354 
355  implicit none
356 
357 !Arguments ------------------------------------
358  integer,intent(in)  :: load
359  type(recGPU_type),intent(inout) :: recgpu
360 ! *************************************************************************
361 
362  recgpu%nptrec = 0
363 
364  if(associated(recgpu%map))  then
365    ABI_FREE(recgpu%map)
366  end if
367  if(load==1)then
368    if(allocated(recgpu%par%displs)) then
369      ABI_FREE(recgpu%par%displs)
370    end if
371    if(allocated(recgpu%par%vcount)) then
372      ABI_FREE(recgpu%par%vcount)
373    end if
374  endif
375  call unset_dev()
376 
377 end subroutine CleanRecGPU

m_hidecudarec/cudarec [ Functions ]

[ Top ] [ m_hidecudarec ] [ Functions ]

NAME

 cudarec

FUNCTION

 Make recursion on a GPU device

INPUTS

OUTPUT

SOURCE

259 #if defined HAVE_GPU_CUDA
260 
261 subroutine cudarec(rset,exppot,an,bn2,beta,trotter,tolrec,gratio,ngfft,max_rec)
262 
263  implicit none
264 
265 !Arguments ------------------------------------
266  integer,intent(in)     :: trotter,gratio
267  real(dp),intent(in)    :: beta,tolrec
268  integer,intent(inout)  :: max_rec
269  type(recursion_type),intent(inout) :: rset
270  integer,intent(in)         :: ngfft(1:3)
271  real(dp), intent(in)       :: exppot(0:product(ngfft)-1)
272  real(cudap), intent(inout) :: an(0:rset%GPU%par%npt-1,0:rset%min_nrec)
273  real(cudap), intent(inout) :: bn2(0:rset%GPU%par%npt-1,0:rset%min_nrec)
274 !Local variables-------------------------------
275  ! character(len=500) :: msg
276  !integer  ::  maxpt,ipt,ii,jj,kk
277  real(dp) :: T_p(0:rset%nfftrec-1)
278  ! **integer***********************************************************************
279 
280 
281 !DEBUG
282 ! write (std_out,*) ' m_hidecudarec/cudarec : enter'
283 !ENDDEBUG
284 
285  call fourdp(1,rset%ZT_p,T_p,1,rset%mpi,rset%nfftrec,1,rset%ngfftrec,0)
286  T_p = (one/rset%nfftrec)*T_p
287 
288  if(.not.(rset%tronc)) then
289    call cuda_rec_cal(trotter,&
290      &               gratio,&
291      &               rset%GPU%par%npt,&
292      &               rset%min_nrec,&
293      &               rset%GPU%nptrec,&
294      &               max_rec,&
295      &               real(beta,cudap),&
296      &               real(rset%efermi,cudap),&
297      &               real(tolrec,cudap),&
298      &               real(rset%inf%ucvol,cudap),&
299      &               rset%GPU%par%pt0,&
300      &               rset%GPU%par%pt1,&
301      &               rset%ngfftrec(1:3),&
302      &               real(T_p,cudap),&
303      &               real(exppot,cudap),&
304      &               an,bn2)
305 
306  else
307 
308 
309    call cuda_rec_cal_cut(trotter,&
310      &                   gratio,&
311      &                   rset%GPU%par%npt,&
312      &                   rset%min_nrec,&
313      &                   rset%GPU%nptrec,&
314      &                   max_rec,&
315      &                   real(beta,cudap),&
316      &                   real(rset%efermi,cudap),&
317      &                   real(tolrec,cudap),&
318      &                   real(rset%inf%ucvol,cudap),&
319      &                   rset%GPU%par%pt0,&
320      &                   rset%GPU%par%pt1,&
321      &                   ngfft,&
322      &                   rset%ngfftrec(1:3),&
323      &                   real(T_p,cudap),&
324      &                   real(exppot,cudap),&
325      &                   an,bn2)
326 
327  endif
328 
329 !DEBUG
330 !write (std_out,*) ' m_hidecudarec/cudarec : exit'
331 !ENDDEBUG
332 
333 end subroutine cudarec

m_hidecudarec/InitRecGPU [ Functions ]

[ Top ] [ m_hidecudarec ] [ Functions ]

NAME

 InitRecGPU

FUNCTION

  If there are devices available then the recGPU_type is initialized

INPUTS

  rset<recusion_type>= contains information of recusion
  gpuinfo<devGPU_type>=contains information of GPU
  calc_type=if 0 takes the possible max for nptrec (to test the
  completly full graphic card). 1 after test to calculate the min
  possible value for nptrec

OUTPUT

  recgpuinfo<recGPU_type>=contains information of recursion with GPU

SOURCE

166 #if defined HAVE_GPU_CUDA
167 
168 subroutine InitRecGPU(rset,nfft,gratio,gpudevice,calc_type)
169 
170  implicit none
171 
172 !Arguments ------------------------------------
173  integer,intent(in) :: nfft,gpudevice
174  integer,intent(in) :: gratio
175  integer,intent(in) :: calc_type
176  type(recursion_type),intent(inout) :: rset
177 !Local variables-------------------------------
178  integer :: pos_size,resto,nfftc
179 ! real(dp) ::
180  type(devGPU_type) :: gpuinfo
181  character(len=500) :: msg
182 ! *************************************************************************
183  nfftc = nfft/(gratio**3)
184  pos_size = 1
185  rset%gpudevice = gpudevice
186 
187 
188  call InitGPU(gpuinfo,gpudevice)
189  !-- look if it is possible to set devices CUDA compatible
190  call set_dev(gpudevice)
191  if(gpudevice>-1)then
192    !--Take the approximate use of memory to compute the number of points on any GPU
193    if(rset%tronc)then
194      !for CUDA version <3.0 :
195      !      pos_size = (.90d0*real(gpuinfo%maxmemdev(0))/real(cudap)&
196      !        &           -real(nfft+4*rset%nfftrec))/real((4*rset%nfftrec+15+2))
197      ! for CUDA version 3.0 with batched FFT:
198     pos_size = (.50d0*real(gpuinfo%maxmemdev(0))/real(cudap)&
199       &           -real(nfft+2*rset%nfftrec))/real((6*rset%nfftrec+15+2))
200 
201 
202      else
203        !for CUDA version <3.0 :
204        !       pos_size = (.90d0*real(gpuinfo%maxmemdev(0))/real(cudap)&
205        !         &           -real(5*rset%nfftrec))/real((3*rset%nfftrec+15)+2)
206        ! for CUDA version 3.0 with batched FFT:
207       pos_size = (.5d0*real(gpuinfo%maxmemdev(0))/real(cudap)-real(3&
208         &*rset%nfftrec))/real((5*rset%nfftrec)+15+2)
209 
210    endif
211    !--The nbr of points has to be bigger than 1 and smaller than
212    !  rset%par%npt (which is the number of points given to any proc to compute
213    !  it is smaller than nfftrec)
214    pos_size = min(pos_size,nfftc)
215 
216    !--if production and not timing test
217    if(calc_type==1) pos_size = min(pos_size,rset%GPU%par%npt)
218 
219    if(pos_size<1) then
220      write(msg,'(a)')' ERROR NO SUFFICENT MEMORY ON DEVICE'
221      call wrtout(std_out,msg,'PERS')
222    end if
223 
224 
225    !--For GPU calculation it is better to have a number of point
226    !  proportional to the half-warp size (16)
227 
228    if(pos_size >16 )then
229      resto = mod(pos_size,16)
230      if(resto /=0) then
231        pos_size = pos_size-resto
232        if(pos_size<nfftc) pos_size = pos_size+16
233      endif
234    endif
235 
236    rset%GPU%nptrec = pos_size
237    if(rset%mpi%me==0) then
238      call prt_mem_usage(pos_size,rset%nfftrec)
239    end if
240  endif
241  call CleanGPU(gpuinfo)
242 
243 end subroutine InitRecGPU

m_hidecudarec/InitRecGPU_0 [ Functions ]

[ Top ] [ m_hidecudarec ] [ Functions ]

NAME

 InitRecGPU_0

FUNCTION

  recGPU_type is initialized

INPUTS

 mpi_ab=MPI information

OUTPUT

 recgpu=initialisation of GPU variables for recursion

SOURCE

127 #if defined HAVE_GPU_CUDA
128 
129 subroutine InitRecGPU_0(recgpu,mpi_ab)
130 
131  implicit none
132 
133 !Arguments ------------------------------------
134  type(MPI_type),intent(in) :: mpi_ab
135  type(recGPU_type),intent(inout) :: recgpu
136 !Local variables-------------------------------
137 ! *************************************************************************
138  recgpu%nptrec = 0
139  nullify(recgpu%map)
140  ABI_MALLOC(recgpu%map,(0:mpi_ab%nproc-1))
141  recgpu%map = -1       !--Initial guess no gpu
142 
143 end subroutine InitRecGPU_0

m_initcuda/prt_mem_usage [ Functions ]

[ Top ] [ m_initcuda ] [ Functions ]

NAME

 prt_mem_usage

FUNCTION

 Print information about allocation on GPU device during recursion

INPUTS

 nptrec=number of vectors allocated on device
 nfft=size of the grid (and so of a vector)

SOURCE

 70 #if defined HAVE_GPU_CUDA
 71 subroutine prt_mem_usage(nptrec,nfft)
 72 
 73   implicit none
 74 !Arguments ------------------------------------
 75   integer,intent(in) :: nptrec,nfft
 76 !Local ---------------------------
 77   integer :: ii
 78   integer(kind=i4b) :: largeur,clargeur
 79   real(dp):: totmem,rpart
 80   character(500) :: msg
 81 ! *********************************************************************
 82 
 83   largeur  = cudap*nfft
 84   clargeur = cudap*nfft*2
 85   !for CUDA version <3.0 :
 86   !   rpart = 3.d0*real(largeur,dp)/1048576.d0*real(nptrec,dp)
 87   !   totmem = rpart+real(2*clargeur+largeur+(2*cudap+i2b)*nptrec,dp)/1048576.d0
 88   !for CUDA version 3.0 :
 89     rpart = 6.d0*real(largeur,dp)/1048576.d0*real(nptrec,dp)
 90     totmem = rpart+real(clargeur+largeur+(2*cudap+i2b)*nptrec,dp)/1048576.d0
 91 
 92   write(msg,'(a,80a)')' ',('_',ii=1,80)
 93   call wrtout(std_out,msg,'COLL')
 94   write(msg,'(a18,a44,a18,a)')'_________________',&
 95 &  '  Allocated Memory on Device for Recursion ','___________________' ,ch10
 96   call wrtout(std_out,msg,'COLL')
 97 
 98   write (msg,'(2(a32,i10,a),2(a32,i10,a6,a),2(a32,f10.2,a7,a))')&
 99     & '   Number of Points            ',nfft  ,ch10, &
100     & '   Number of Vectors           ',nptrec,ch10, &
101     & '   Size Real Vectors           ',largeur ,'bytes',ch10, &
102     & '   Size Complex Vectors        ',clargeur,'bytes',ch10, &
103     & '   Size Matrix of Vectors      ',real(largeur*nptrec,dp)/1048576.d0,'Mbytes',ch10, &
104     & '   Allocated Memory on GPU     ',totmem,'Mbytes',ch10
105   call wrtout(std_out,msg,'COLL')
106   write(msg,'(a,80a)')' ',('_',ii=1,80)
107   call wrtout(std_out,msg,'COLL')
108 end subroutine prt_mem_usage
109 
110 #endif