TABLE OF CONTENTS


ABINIT/m_ompgpu_fourwf [ Modules ]

[ Top ] [ Modules ]

NAME

  m_ompgpu_fourwf

FUNCTION

  Fake module to dupe the build system and allow it to include cuda files
   in the chain of dependencies.

COPYRIGHT

  Copyright (C) 2000-2024 ABINIT group (MT)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

 21 #if defined HAVE_CONFIG_H
 22 #include "config.h"
 23 #endif
 24 
 25 #include "abi_common.h"
 26 
 27 module m_ompgpu_fourwf
 28 
 29  use defs_basis
 30  use m_abicore
 31  use m_abi_linalg
 32  use m_errors
 33  use m_xomp
 34 
 35 #ifdef HAVE_FC_ISO_C_BINDING
 36  use iso_c_binding
 37 #endif
 38 
 39 #ifdef HAVE_GPU
 40  use m_gpu_toolbox
 41 #endif
 42 
 43  implicit none
 44 
 45  private
 46 
 47 #ifdef HAVE_OPENMP_OFFLOAD
 48 
 49 ! STATIC vars to avoid too much cuda overhead
 50 integer :: fourwf_initialized=0
 51 
 52 ! FFT plan
 53 integer :: fft_size=-1
 54 integer :: ndat_loc=-1
 55 integer :: npw=-1
 56 
 57 ! GPU ffts buffers
 58 real(dp),allocatable,target :: work_gpu(:,:,:,:)
 59 
 60 #endif
 61 
 62  public :: ompgpu_fourwf
 63  public :: alloc_ompgpu_fourwf
 64  public :: free_ompgpu_fourwf
 65 
 66  ! This routine is here to assess memory requirements
 67  public :: ompgpu_fourwf_work_mem
 68 contains
 69 
 70 function ompgpu_fourwf_work_mem(ngfft, ndat) result(req_mem)
 71 
 72  implicit none
 73 
 74  integer, intent(in) :: ngfft(:), ndat
 75  integer(kind=c_size_t) :: req_mem
 76 
 77  req_mem = 2 * dp * int(ngfft(1), c_size_t) * int(ngfft(2), c_size_t) * &
 78  &         int(ngfft(3), c_size_t) * int(ndat, c_size_t)
 79 
 80 end function ompgpu_fourwf_work_mem
 81 
 82 !Tested usecases :
 83 ! - Nvidia GPUs : FC_NVHPC + CUDA
 84 ! - AMD GPUs    : FC_LLVM + HIP
 85 ! An eventual Intel implementation would use the OneAPI LLVM compiler.
 86 ! Homemade CUDA/HIP interfaces would allow the use of GCC.
 87 ! But it is likely that OpenMP performance won't be optimal outside GPU vendors compilers.
 88 #ifdef HAVE_OPENMP_OFFLOAD
 89 
 90 subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,&
 91 &  kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
 92 
 93 !Arguments ------------------------------------
 94 !scalars
 95  integer,intent(in) :: cplex,istwf_k,ldx,ldy,ldz,ndat,npwin,npwout,option,mgfft
 96  real(dp),intent(in) :: weight_i(ndat),weight_r(ndat)
 97 !arrays
 98  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
 99  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18)
100  real(dp),target,intent(inout) :: denpot(cplex*ldx,ldy,ldz)
101  real(dp),intent(in) :: fofgin(2,npwin*ndat)
102  real(dp),intent(inout),target :: fofr(2,ldx,ldy,ldz*ndat)
103  real(dp),intent(out) :: fofgout(2,npwout*ndat)
104 
105 !Local variables-------------------------------
106 !scalars
107  integer,parameter :: me_g0=1,ndat1=1
108  integer :: nx,ny,nz,fftalg,fftalga,fftalgc,fftcache,dat,ptg,ptr,ptgin,ptgout,nthreads
109  character(len=500) :: msg
110 
111  real(dp) :: xnorm,one
112 
113  !Cuda return code
114  integer fft_state
115  integer rc
116 
117  !Local integer
118  integer n1,n2,n3,nfft_tot,i,offset
119 
120  integer :: cfft_size
121  integer :: shift_inv1,shift_inv2,shift_inv3
122  integer :: i1,i2,i3,ipw,idat;
123  integer :: i1inv,i2inv,i3inv
124  logical :: transfer_fofgin, transfer_fofgout, transfer_fofr
125  integer(C_SIZE_T) :: byte_count
126 
127 #if defined HAVE_GPU_HIP && defined FC_LLVM
128  real(dp), ABI_CONTIGUOUS pointer :: fofr_amdref(:,:,:,:)
129 #endif
130 
131  n1=ngfft(1);
132  n2=ngfft(2);
133  n3=ngfft(3);
134  nfft_tot=n1*n2*n3;
135 
136  !*********** CHECK some compatibilities **************
137  if( (n1/=ldx) .or. (n2/=ldy) .or. (n3/=ldz)) then
138    write(msg,"(a,a,i5,i5,i5,a,i5,i5,i5,a)") "FFT SIZE ERROR: \n when gpu mode is on the fft grid must not be augmented",&
139     "(n1,n2,n3) = (", n1,n2,n3,") whereas (ldx,ldy,ldz) = (", ldx,ldy,ldz, ")"
140    ABI_ERROR(msg)
141  end if
142 
143  if(cplex==2)  then
144    ABI_ERROR("gpu_fourwf: cplex == 2 is not treated in GPU mode")
145  end if
146 
147  !*************** CUDA INITIALISATION STAGE ****
148  if (fourwf_initialized == 0) then
149    call alloc_ompgpu_fourwf(ngfft,ndat)
150  endif !end of initialisation
151 
152 
153  ! ***********  GPU ALLOCATIONS  ***********************
154 
155  ! If fft size has changed, we realloc our buffers
156  if((nfft_tot/=fft_size) .or. (ndat/=ndat_loc)) then
157    call free_ompgpu_fourwf
158    call alloc_ompgpu_fourwf(ngfft,ndat)
159  end if !end if "fft size changed"
160 
161  transfer_fofgin=.not. xomp_target_is_present(c_loc(fofgin))
162  transfer_fofgout=.not. xomp_target_is_present(c_loc(fofgout))
163  transfer_fofr=.not. xomp_target_is_present(c_loc(fofr))
164 
165  !$OMP TARGET ENTER DATA MAP(to:fofgin)     IF(transfer_fofgin)
166  !$OMP TARGET ENTER DATA MAP(alloc:fofgout) IF(transfer_fofgout)
167  !$OMP TARGET ENTER DATA MAP(alloc:fofr)    IF(transfer_fofr)
168 
169 #ifdef HAVE_GPU_CUDA
170  !Work buffer allocated at each call to save memory in CUDA
171  !$OMP TARGET ENTER DATA MAP(alloc:work_gpu)
172 #endif
173 
174  if(option==3) then
175    !$OMP TARGET UPDATE TO(fofr)
176  endif
177 
178  if(option/=3) then
179    ! We launch async transfert of denpot
180    !$OMP TARGET ENTER DATA MAP(alloc:denpot)
181    !FIXME This async transfer might be better handled through CUDA/HIP after all...
182    ! Issues randomly occurs when using Cray compiler, seems fine with NVHPC.
183 #ifdef FC_CRAY
184    !$OMP TARGET UPDATE TO(denpot)
185 #else
186    !$OMP TARGET UPDATE TO(denpot) NOWAIT
187 #endif
188    if(option == 1) then
189 #ifdef FC_CRAY
190      !$OMP TARGET ENTER DATA MAP(to:weight_r,weight_i)
191 #else
192      !$OMP TARGET ENTER DATA MAP(to:weight_r,weight_i) NOWAIT
193 #endif
194    endif
195 
196 #if defined HAVE_GPU_HIP && defined FC_LLVM
197    !FIXME Work-around for AOMP v15.0.3 (AMD Flang fork)
198    ! For some reason, fofr won't be processed normally when passed as argument
199    ! of FFT routine within TARGET DATA directives
200    fofr_amdref => fofr
201 #endif
202 
203    ! GPU_SPHERE_IN
204 
205    cfft_size = 2*n1*n2*n3*ndat
206 
207 #if defined HAVE_GPU_CUDA
208    byte_count=sizeof(work_gpu)
209    !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu)
210    call gpu_memset(c_loc(work_gpu), 0, byte_count)
211    !$OMP END TARGET DATA
212 #elif defined HAVE_GPU_HIP
213    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3)  MAP(to:work_gpu)
214    do i3=1,n3*ndat
215      do i2=1,n2
216        do i1=1,n1
217          work_gpu(:,i1,i2,i3) = 0
218        end do
219      end do
220    end do
221 #endif
222 
223    ! During GPU calculation we do some pre-calculation on symetries
224    if((istwf_k==2) .or. (istwf_k==4) .or. (istwf_k==6) .or. (istwf_k==8)) then
225      shift_inv1 = n1;
226    else
227      shift_inv1 = n1-1;
228    end if
229 
230    if((istwf_k>=2) .and. (istwf_k<=5))  then
231      shift_inv2 = n2;
232    else
233      shift_inv2 = n2-1;
234    end if
235 
236    if((istwf_k==2) .or. (istwf_k==3) .or. (istwf_k==6) .or. (istwf_k==7)) then
237      shift_inv3 = n3;
238    else
239      shift_inv3 = n3-1;
240    end if
241 
242 
243    !!$OMP TARGET TEAMS LOOP &
244    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) &
245    !$OMP& PRIVATE(idat,ipw,i1,i2,i3) MAP(to:work_gpu,kg_kin,fofgin)
246    do idat = 1, ndat
247      do ipw = 1, npwin
248        i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1; i1=i1+1
249        i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2; i2=i2+1
250        i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3; i3=i3+1
251        ! We write cfft(i1,i2,i3)
252        ! (double2): cfft[i1 + n1*(i2 + n2*(i3 + n3*idat))] = cg[ipw + npw*idat]
253        work_gpu(1, i1, i2, i3+n3*(idat-1)) = fofgin(1, (ipw + npwin*(idat-1)))
254        work_gpu(2, i1, i2, i3+n3*(idat-1)) = fofgin(2, (ipw + npwin*(idat-1)))
255      end do
256    end do
257 
258    if(istwf_k > 1) then
259      !!$OMP TARGET TEAMS LOOP &
260      !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) &
261      !$OMP& PRIVATE(idat,ipw,i1,i2,i3,i1inv,i2inv,i3inv) MAP(to:work_gpu,kg_kin,fofgin)
262      do idat = 1, ndat
263        do ipw = 1, npwin
264          i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1;
265          i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2;
266          i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3;
267 #if defined HAVE_GPU_CUDA
268          i1inv = modulo(shift_inv1 - i1, n1) + 1
269          i2inv = modulo(shift_inv2 - i2, n2) + 1
270          i3inv = modulo(shift_inv3 - i3, n3) + 1
271 #elif defined HAVE_GPU_HIP
272          i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1
273          i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1
274          i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1
275 #endif
276          work_gpu(1, i1inv, i2inv, i3inv+n3*(idat-1)) =  fofgin(1, (ipw + npwin*(idat-1)))
277          work_gpu(2, i1inv, i2inv, i3inv+n3*(idat-1)) = -fofgin(2, (ipw + npwin*(idat-1)))
278        end do
279      end do
280    end if
281 
282    ! call backward fourrier transform on gpu work_gpu => fofr_gpu
283 #if defined HAVE_GPU_HIP && defined FC_LLVM
284    !$OMP TARGET DATA USE_DEVICE_ADDR(work_gpu,fofr_amdref)
285    call gpu_fft_exec_z2z(c_loc(work_gpu), c_loc(fofr_amdref), FFT_INVERSE)
286    !$OMP END TARGET DATA
287 #else
288    !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu,fofr)
289    call gpu_fft_exec_z2z(c_loc(work_gpu), c_loc(fofr), FFT_INVERSE)
290    !$OMP END TARGET DATA
291 #endif
292    call gpu_fft_stream_synchronize()
293  end if
294 
295  if(option==0) then
296    ! We copy back fofr
297    !$OMP TARGET UPDATE from(fofr)
298  end if
299 
300  if(option==1) then
301    ! We finish denpot and weight transferts
302    !!$OMP TASKWAIT depend(in:denpot,weight_r,weight_i)
303    !$OMP TASKWAIT
304 
305    ! call density accumulation routine on gpu
306    !!$OMP TARGET TEAMS LOOP &
307    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
308    !$OMP& PRIVATE(idat,i1,i2,i3) MAP(to:fofr,denpot,weight_r,weight_i)
309    do i3=1, n3
310      do i2=1, n2
311        do i1=1, n1
312          do idat = 1, ndat
313            denpot(i1,i2,i3) = denpot(i1,i2,i3) + fofr(1, i1, i2, i3+(idat-1)*n3)*fofr(1, i1, i2, i3+(idat-1)*n3)*weight_r(idat)
314            denpot(i1,i2,i3) = denpot(i1,i2,i3) + fofr(2, i1, i2, i3+(idat-1)*n3)*fofr(2, i1, i2, i3+(idat-1)*n3)*weight_i(idat)
315          end do
316        end do
317      end do
318    end do
319    !$OMP TARGET UPDATE from(denpot)
320 
321  end if
322 
323  if(option==2) then
324    ! We finished denpot transfert
325    !!$OMP TASKWAIT depend(in:denpot)
326    !$OMP TASKWAIT
327 
328    ! call gpu routine to  Apply local potential
329    !!$OMP TARGET TEAMS LOOP &
330    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(4) &
331    !$OMP& PRIVATE(idat,i1,i2,i3) MAP(to:denpot,fofr)
332    do idat = 1, ndat
333      do i3=1, n3
334        do i2=1, n2
335          do i1=1, n1
336            fofr(1, i1, i2, i3+(idat-1)*n3) = fofr(1, i1, i2, i3+(idat-1)*n3) * denpot(i1,i2,i3)
337            fofr(2, i1, i2, i3+(idat-1)*n3) = fofr(2, i1, i2, i3+(idat-1)*n3) * denpot(i1,i2,i3)
338          end do
339        end do
340      end do
341    end do
342  end if
343 
344  if(option==2 .or. option==3) then
345 
346    ! call forward fourier transform on gpu: fofr_gpu ==> work_gpu
347 #if defined HAVE_GPU_HIP && defined FC_LLVM
348    !$OMP TARGET DATA USE_DEVICE_ADDR(work_gpu,fofr_amdref)
349    call gpu_fft_exec_z2z(c_loc(fofr_amdref), c_loc(work_gpu), FFT_FORWARD)
350    !$OMP END TARGET DATA
351 #else
352    !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu,fofr)
353    call gpu_fft_exec_z2z(c_loc(fofr), c_loc(work_gpu), FFT_FORWARD)
354    !$OMP END TARGET DATA
355 #endif
356    call gpu_fft_stream_synchronize()
357 
358    one=1
359    xnorm=one/dble(n1*n2*n3)
360    !!$OMP TARGET TEAMS LOOP &
361    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) &
362    !$OMP& PRIVATE(idat,ipw,i1,i2,i3) MAP(to:work_gpu,kg_kout,fofgout)
363    do idat = 1, ndat
364      do ipw = 1, npwout
365        i1=kg_kout(1,ipw); if(i1<0)i1=i1+n1; i1=i1+1
366        i2=kg_kout(2,ipw); if(i2<0)i2=i2+n2; i2=i2+1
367        i3=kg_kout(3,ipw); if(i3<0)i3=i3+n3; i3=i3+1
368 
369        ! We write cg(ig)
370        fofgout (1, ipw + npwout*(idat-1)) = work_gpu(1, i1, i2, i3+n3*(idat-1)) * xnorm
371        fofgout (2, ipw + npwout*(idat-1)) = work_gpu(2, i1, i2, i3+n3*(idat-1)) * xnorm
372      end do
373    end do
374 
375    !$OMP TARGET UPDATE FROM(fofgout)   IF(transfer_fofgout)
376  end if
377 
378  if(option/=3) then
379    ! We launch async transfert of denpot
380    !!$OMP TARGET UPDATE FROM(denpot)
381    !$OMP TARGET EXIT DATA MAP(delete:denpot)
382    if(option == 1) then
383      !$OMP TARGET EXIT DATA MAP(delete:weight_r,weight_i)
384    endif
385  endif
386 
387 #ifdef HAVE_GPU_CUDA
388  !Work buffer allocated at each call to save memory in CUDA
389  !$OMP TARGET EXIT DATA MAP(delete:work_gpu)
390 #endif
391 
392  !$OMP TARGET EXIT DATA MAP(delete:fofgin)  IF(transfer_fofgin)
393  !$OMP TARGET EXIT DATA MAP(delete:fofgout) IF(transfer_fofgout)
394  !$OMP TARGET EXIT DATA MAP(delete:fofr)    IF(transfer_fofr)
395 
396 end subroutine ompgpu_fourwf