TABLE OF CONTENTS
ABINIT/m_ompgpu_fourwf [ Modules ]
NAME
m_ompgpu_fourwf
FUNCTION
Fake module to dupe the build system and allow it to include cuda files in the chain of dependencies.
COPYRIGHT
Copyright (C) 2000-2024 ABINIT group (MT) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
PARENTS
CHILDREN
SOURCE
21 #if defined HAVE_CONFIG_H 22 #include "config.h" 23 #endif 24 25 #include "abi_common.h" 26 27 module m_ompgpu_fourwf 28 29 use defs_basis 30 use m_abicore 31 use m_abi_linalg 32 use m_errors 33 use m_xomp 34 35 #ifdef HAVE_FC_ISO_C_BINDING 36 use iso_c_binding 37 #endif 38 39 #ifdef HAVE_GPU 40 use m_gpu_toolbox 41 #endif 42 43 implicit none 44 45 private 46 47 #ifdef HAVE_OPENMP_OFFLOAD 48 49 ! STATIC vars to avoid too much cuda overhead 50 integer :: fourwf_initialized=0 51 52 ! FFT plan 53 integer :: fft_size=-1 54 integer :: ndat_loc=-1 55 integer :: npw=-1 56 57 ! GPU ffts buffers 58 real(dp),allocatable,target :: work_gpu(:,:,:,:) 59 60 #endif 61 62 public :: ompgpu_fourwf 63 public :: alloc_ompgpu_fourwf 64 public :: free_ompgpu_fourwf 65 66 ! This routine is here to assess memory requirements 67 public :: ompgpu_fourwf_work_mem 68 contains 69 70 function ompgpu_fourwf_work_mem(ngfft, ndat) result(req_mem) 71 72 implicit none 73 74 integer, intent(in) :: ngfft(:), ndat 75 integer(kind=c_size_t) :: req_mem 76 77 req_mem = 2 * dp * int(ngfft(1), c_size_t) * int(ngfft(2), c_size_t) * & 78 & int(ngfft(3), c_size_t) * int(ndat, c_size_t) 79 80 end function ompgpu_fourwf_work_mem 81 82 !Tested usecases : 83 ! - Nvidia GPUs : FC_NVHPC + CUDA 84 ! - AMD GPUs : FC_LLVM + HIP 85 ! An eventual Intel implementation would use the OneAPI LLVM compiler. 86 ! Homemade CUDA/HIP interfaces would allow the use of GCC. 87 ! But it is likely that OpenMP performance won't be optimal outside GPU vendors compilers. 88 #ifdef HAVE_OPENMP_OFFLOAD 89 90 subroutine ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 91 & kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i) 92 93 !Arguments ------------------------------------ 94 !scalars 95 integer,intent(in) :: cplex,istwf_k,ldx,ldy,ldz,ndat,npwin,npwout,option,mgfft 96 real(dp),intent(in) :: weight_i(ndat),weight_r(ndat) 97 !arrays 98 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2) 99 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18) 100 real(dp),target,intent(inout) :: denpot(cplex*ldx,ldy,ldz) 101 real(dp),intent(in) :: fofgin(2,npwin*ndat) 102 real(dp),intent(inout),target :: fofr(2,ldx,ldy,ldz*ndat) 103 real(dp),intent(out) :: fofgout(2,npwout*ndat) 104 105 !Local variables------------------------------- 106 !scalars 107 integer,parameter :: me_g0=1,ndat1=1 108 integer :: nx,ny,nz,fftalg,fftalga,fftalgc,fftcache,dat,ptg,ptr,ptgin,ptgout,nthreads 109 character(len=500) :: msg 110 111 real(dp) :: xnorm,one 112 113 !Cuda return code 114 integer fft_state 115 integer rc 116 117 !Local integer 118 integer n1,n2,n3,nfft_tot,i,offset 119 120 integer :: cfft_size 121 integer :: shift_inv1,shift_inv2,shift_inv3 122 integer :: i1,i2,i3,ipw,idat; 123 integer :: i1inv,i2inv,i3inv 124 logical :: transfer_fofgin, transfer_fofgout, transfer_fofr 125 integer(C_SIZE_T) :: byte_count 126 127 #if defined HAVE_GPU_HIP && defined FC_LLVM 128 real(dp), ABI_CONTIGUOUS pointer :: fofr_amdref(:,:,:,:) 129 #endif 130 131 n1=ngfft(1); 132 n2=ngfft(2); 133 n3=ngfft(3); 134 nfft_tot=n1*n2*n3; 135 136 !*********** CHECK some compatibilities ************** 137 if( (n1/=ldx) .or. (n2/=ldy) .or. (n3/=ldz)) then 138 write(msg,"(a,a,i5,i5,i5,a,i5,i5,i5,a)") "FFT SIZE ERROR: \n when gpu mode is on the fft grid must not be augmented",& 139 "(n1,n2,n3) = (", n1,n2,n3,") whereas (ldx,ldy,ldz) = (", ldx,ldy,ldz, ")" 140 ABI_ERROR(msg) 141 end if 142 143 if(cplex==2) then 144 ABI_ERROR("gpu_fourwf: cplex == 2 is not treated in GPU mode") 145 end if 146 147 !*************** CUDA INITIALISATION STAGE **** 148 if (fourwf_initialized == 0) then 149 call alloc_ompgpu_fourwf(ngfft,ndat) 150 endif !end of initialisation 151 152 153 ! *********** GPU ALLOCATIONS *********************** 154 155 ! If fft size has changed, we realloc our buffers 156 if((nfft_tot/=fft_size) .or. (ndat/=ndat_loc)) then 157 call free_ompgpu_fourwf 158 call alloc_ompgpu_fourwf(ngfft,ndat) 159 end if !end if "fft size changed" 160 161 transfer_fofgin=.not. xomp_target_is_present(c_loc(fofgin)) 162 transfer_fofgout=.not. xomp_target_is_present(c_loc(fofgout)) 163 transfer_fofr=.not. xomp_target_is_present(c_loc(fofr)) 164 165 !$OMP TARGET ENTER DATA MAP(to:fofgin) IF(transfer_fofgin) 166 !$OMP TARGET ENTER DATA MAP(alloc:fofgout) IF(transfer_fofgout) 167 !$OMP TARGET ENTER DATA MAP(alloc:fofr) IF(transfer_fofr) 168 169 #ifdef HAVE_GPU_CUDA 170 !Work buffer allocated at each call to save memory in CUDA 171 !$OMP TARGET ENTER DATA MAP(alloc:work_gpu) 172 #endif 173 174 if(option==3) then 175 !$OMP TARGET UPDATE TO(fofr) 176 endif 177 178 if(option/=3) then 179 ! We launch async transfert of denpot 180 !$OMP TARGET ENTER DATA MAP(alloc:denpot) 181 !FIXME This async transfer might be better handled through CUDA/HIP after all... 182 ! Issues randomly occurs when using Cray compiler, seems fine with NVHPC. 183 #ifdef FC_CRAY 184 !$OMP TARGET UPDATE TO(denpot) 185 #else 186 !$OMP TARGET UPDATE TO(denpot) NOWAIT 187 #endif 188 if(option == 1) then 189 #ifdef FC_CRAY 190 !$OMP TARGET ENTER DATA MAP(to:weight_r,weight_i) 191 #else 192 !$OMP TARGET ENTER DATA MAP(to:weight_r,weight_i) NOWAIT 193 #endif 194 endif 195 196 #if defined HAVE_GPU_HIP && defined FC_LLVM 197 !FIXME Work-around for AOMP v15.0.3 (AMD Flang fork) 198 ! For some reason, fofr won't be processed normally when passed as argument 199 ! of FFT routine within TARGET DATA directives 200 fofr_amdref => fofr 201 #endif 202 203 ! GPU_SPHERE_IN 204 205 cfft_size = 2*n1*n2*n3*ndat 206 207 #if defined HAVE_GPU_CUDA 208 byte_count=sizeof(work_gpu) 209 !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu) 210 call gpu_memset(c_loc(work_gpu), 0, byte_count) 211 !$OMP END TARGET DATA 212 #elif defined HAVE_GPU_HIP 213 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(i1,i2,i3) MAP(to:work_gpu) 214 do i3=1,n3*ndat 215 do i2=1,n2 216 do i1=1,n1 217 work_gpu(:,i1,i2,i3) = 0 218 end do 219 end do 220 end do 221 #endif 222 223 ! During GPU calculation we do some pre-calculation on symetries 224 if((istwf_k==2) .or. (istwf_k==4) .or. (istwf_k==6) .or. (istwf_k==8)) then 225 shift_inv1 = n1; 226 else 227 shift_inv1 = n1-1; 228 end if 229 230 if((istwf_k>=2) .and. (istwf_k<=5)) then 231 shift_inv2 = n2; 232 else 233 shift_inv2 = n2-1; 234 end if 235 236 if((istwf_k==2) .or. (istwf_k==3) .or. (istwf_k==6) .or. (istwf_k==7)) then 237 shift_inv3 = n3; 238 else 239 shift_inv3 = n3-1; 240 end if 241 242 243 !!$OMP TARGET TEAMS LOOP & 244 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 245 !$OMP& PRIVATE(idat,ipw,i1,i2,i3) MAP(to:work_gpu,kg_kin,fofgin) 246 do idat = 1, ndat 247 do ipw = 1, npwin 248 i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1; i1=i1+1 249 i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2; i2=i2+1 250 i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3; i3=i3+1 251 ! We write cfft(i1,i2,i3) 252 ! (double2): cfft[i1 + n1*(i2 + n2*(i3 + n3*idat))] = cg[ipw + npw*idat] 253 work_gpu(1, i1, i2, i3+n3*(idat-1)) = fofgin(1, (ipw + npwin*(idat-1))) 254 work_gpu(2, i1, i2, i3+n3*(idat-1)) = fofgin(2, (ipw + npwin*(idat-1))) 255 end do 256 end do 257 258 if(istwf_k > 1) then 259 !!$OMP TARGET TEAMS LOOP & 260 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 261 !$OMP& PRIVATE(idat,ipw,i1,i2,i3,i1inv,i2inv,i3inv) MAP(to:work_gpu,kg_kin,fofgin) 262 do idat = 1, ndat 263 do ipw = 1, npwin 264 i1=kg_kin(1,ipw); if(i1<0)i1=i1+n1; 265 i2=kg_kin(2,ipw); if(i2<0)i2=i2+n2; 266 i3=kg_kin(3,ipw); if(i3<0)i3=i3+n3; 267 #if defined HAVE_GPU_CUDA 268 i1inv = modulo(shift_inv1 - i1, n1) + 1 269 i2inv = modulo(shift_inv2 - i2, n2) + 1 270 i3inv = modulo(shift_inv3 - i3, n3) + 1 271 #elif defined HAVE_GPU_HIP 272 i1inv = (shift_inv1-i1) - ( ((shift_inv1-i1)/n1) * n1 ) + 1 273 i2inv = (shift_inv2-i2) - ( ((shift_inv2-i2)/n2) * n2 ) + 1 274 i3inv = (shift_inv3-i3) - ( ((shift_inv3-i3)/n3) * n3 ) + 1 275 #endif 276 work_gpu(1, i1inv, i2inv, i3inv+n3*(idat-1)) = fofgin(1, (ipw + npwin*(idat-1))) 277 work_gpu(2, i1inv, i2inv, i3inv+n3*(idat-1)) = -fofgin(2, (ipw + npwin*(idat-1))) 278 end do 279 end do 280 end if 281 282 ! call backward fourrier transform on gpu work_gpu => fofr_gpu 283 #if defined HAVE_GPU_HIP && defined FC_LLVM 284 !$OMP TARGET DATA USE_DEVICE_ADDR(work_gpu,fofr_amdref) 285 call gpu_fft_exec_z2z(c_loc(work_gpu), c_loc(fofr_amdref), FFT_INVERSE) 286 !$OMP END TARGET DATA 287 #else 288 !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu,fofr) 289 call gpu_fft_exec_z2z(c_loc(work_gpu), c_loc(fofr), FFT_INVERSE) 290 !$OMP END TARGET DATA 291 #endif 292 call gpu_fft_stream_synchronize() 293 end if 294 295 if(option==0) then 296 ! We copy back fofr 297 !$OMP TARGET UPDATE from(fofr) 298 end if 299 300 if(option==1) then 301 ! We finish denpot and weight transferts 302 !!$OMP TASKWAIT depend(in:denpot,weight_r,weight_i) 303 !$OMP TASKWAIT 304 305 ! call density accumulation routine on gpu 306 !!$OMP TARGET TEAMS LOOP & 307 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & 308 !$OMP& PRIVATE(idat,i1,i2,i3) MAP(to:fofr,denpot,weight_r,weight_i) 309 do i3=1, n3 310 do i2=1, n2 311 do i1=1, n1 312 do idat = 1, ndat 313 denpot(i1,i2,i3) = denpot(i1,i2,i3) + fofr(1, i1, i2, i3+(idat-1)*n3)*fofr(1, i1, i2, i3+(idat-1)*n3)*weight_r(idat) 314 denpot(i1,i2,i3) = denpot(i1,i2,i3) + fofr(2, i1, i2, i3+(idat-1)*n3)*fofr(2, i1, i2, i3+(idat-1)*n3)*weight_i(idat) 315 end do 316 end do 317 end do 318 end do 319 !$OMP TARGET UPDATE from(denpot) 320 321 end if 322 323 if(option==2) then 324 ! We finished denpot transfert 325 !!$OMP TASKWAIT depend(in:denpot) 326 !$OMP TASKWAIT 327 328 ! call gpu routine to Apply local potential 329 !!$OMP TARGET TEAMS LOOP & 330 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(4) & 331 !$OMP& PRIVATE(idat,i1,i2,i3) MAP(to:denpot,fofr) 332 do idat = 1, ndat 333 do i3=1, n3 334 do i2=1, n2 335 do i1=1, n1 336 fofr(1, i1, i2, i3+(idat-1)*n3) = fofr(1, i1, i2, i3+(idat-1)*n3) * denpot(i1,i2,i3) 337 fofr(2, i1, i2, i3+(idat-1)*n3) = fofr(2, i1, i2, i3+(idat-1)*n3) * denpot(i1,i2,i3) 338 end do 339 end do 340 end do 341 end do 342 end if 343 344 if(option==2 .or. option==3) then 345 346 ! call forward fourier transform on gpu: fofr_gpu ==> work_gpu 347 #if defined HAVE_GPU_HIP && defined FC_LLVM 348 !$OMP TARGET DATA USE_DEVICE_ADDR(work_gpu,fofr_amdref) 349 call gpu_fft_exec_z2z(c_loc(fofr_amdref), c_loc(work_gpu), FFT_FORWARD) 350 !$OMP END TARGET DATA 351 #else 352 !$OMP TARGET DATA USE_DEVICE_PTR(work_gpu,fofr) 353 call gpu_fft_exec_z2z(c_loc(fofr), c_loc(work_gpu), FFT_FORWARD) 354 !$OMP END TARGET DATA 355 #endif 356 call gpu_fft_stream_synchronize() 357 358 one=1 359 xnorm=one/dble(n1*n2*n3) 360 !!$OMP TARGET TEAMS LOOP & 361 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & 362 !$OMP& PRIVATE(idat,ipw,i1,i2,i3) MAP(to:work_gpu,kg_kout,fofgout) 363 do idat = 1, ndat 364 do ipw = 1, npwout 365 i1=kg_kout(1,ipw); if(i1<0)i1=i1+n1; i1=i1+1 366 i2=kg_kout(2,ipw); if(i2<0)i2=i2+n2; i2=i2+1 367 i3=kg_kout(3,ipw); if(i3<0)i3=i3+n3; i3=i3+1 368 369 ! We write cg(ig) 370 fofgout (1, ipw + npwout*(idat-1)) = work_gpu(1, i1, i2, i3+n3*(idat-1)) * xnorm 371 fofgout (2, ipw + npwout*(idat-1)) = work_gpu(2, i1, i2, i3+n3*(idat-1)) * xnorm 372 end do 373 end do 374 375 !$OMP TARGET UPDATE FROM(fofgout) IF(transfer_fofgout) 376 end if 377 378 if(option/=3) then 379 ! We launch async transfert of denpot 380 !!$OMP TARGET UPDATE FROM(denpot) 381 !$OMP TARGET EXIT DATA MAP(delete:denpot) 382 if(option == 1) then 383 !$OMP TARGET EXIT DATA MAP(delete:weight_r,weight_i) 384 endif 385 endif 386 387 #ifdef HAVE_GPU_CUDA 388 !Work buffer allocated at each call to save memory in CUDA 389 !$OMP TARGET EXIT DATA MAP(delete:work_gpu) 390 #endif 391 392 !$OMP TARGET EXIT DATA MAP(delete:fofgin) IF(transfer_fofgin) 393 !$OMP TARGET EXIT DATA MAP(delete:fofgout) IF(transfer_fofgout) 394 !$OMP TARGET EXIT DATA MAP(delete:fofr) IF(transfer_fofr) 395 396 end subroutine ompgpu_fourwf