ABINIT/alloc_hamilt_gpu [ Functions ]


NAME

 alloc_hamilt_gpu

FUNCTION

 allocate several memory blocks on a GPU device for the application
 of the Hamiltonian on the GPU

INPUTS

  atindx1(natom)=index table for atoms, inverse of atindx (see gstate.f)
  dtset <type(dataset_type)>=all input variables for this dataset
  gprimd(3,3)=dimensional reciprocal space primitive translations
  mpi_enreg=information about MPI parallelization
  nattyp(ntypat)= # atoms of each type.
  option=0: allocate data for local operator (FFT)
         1: allocate data for nonlocal operator
         2: allocate both
  psps <type(pseudopotential_type)>=variables related to pseudopotentials
  gpu_option= GPU implementation to use, e.g. CUDA, OpenMP, ... (0=not using GPU)

OUTPUT

  (no output - only allocation on GPU)
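
 For orientation, a minimal call sequence might look as follows. This is a
 sketch only: the surrounding driver context and the use of dtset%gpu_option
 as the last argument are assumptions, not taken from this file.

   ! Hedged usage sketch (hypothetical driver context): allocate the GPU
   ! buffers for both operators, apply the Hamiltonian, then free them
   ! with the matching option value.
   call alloc_hamilt_gpu(atindx1,dtset,gprimd,mpi_enreg,nattyp,npwarr, &
  &                      2,psps,dtset%gpu_option)   ! option=2: local + nonlocal
   ! ... apply the Hamiltonian on the GPU ...
   call dealloc_hamilt_gpu(2,dtset%gpu_option)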

SOURCE

subroutine alloc_hamilt_gpu(atindx1,dtset,gprimd,mpi_enreg,nattyp,npwarr,option,psps,gpu_option)

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: option,gpu_option
 type(dataset_type),intent(in) :: dtset
 type(MPI_type),intent(in) :: mpi_enreg
 type(pseudopotential_type),intent(in) :: psps
!arrays
 integer,intent(in) :: atindx1(dtset%natom),nattyp(dtset%ntypat),npwarr(dtset%nkpt)
 real(dp),intent(in) :: gprimd(3,3)

!Local variables-------------------------------
!scalars
#if defined HAVE_GPU
 integer :: dimekb1_max,dimekb2_max,dimffnl_max,ierr,ikpt,npw_max_loc,npw_max_nonloc
 integer :: cplex
 integer :: npwarr_tmp(dtset%nkpt)

 integer(kind=c_int32_t) :: proj_dim(3)
#endif

! *************************************************************************

 if (gpu_option==ABI_GPU_DISABLED) return

#if defined(HAVE_GPU)
!=== Local Hamiltonian ===
 if (option==0.or.option==2) then
!  Compute the max of the total number of plane waves
   npw_max_loc=0
   if(mpi_enreg%paral_kgb==1) then
!    Each band-FFT process holds only part of the counts: sum them first
     npwarr_tmp=npwarr
     call xmpi_sum(npwarr_tmp,mpi_enreg%comm_bandfft,ierr)
     npw_max_loc=maxval(npwarr_tmp)
   else
     npw_max_loc=dtset%mpw
   end if
   ! Initialize gpu data needed in fourwf
   ! ndat=bandpp when paral_kgb=1
   ! whether paral_kgb=0 or 1, we gather all bands into a single call to gpu_fourwf
   if(gpu_option == ABI_GPU_LEGACY) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
     call alloc_gpu_fourwf(dtset%ngfft,dtset%bandpp,npw_max_loc,npw_max_loc)
#endif
   else if (gpu_option == ABI_GPU_KOKKOS) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
     call alloc_gpu_fourwf_managed(dtset%ngfft,dtset%bandpp,npw_max_loc,npw_max_loc)
#endif
   else if (gpu_option == ABI_GPU_OPENMP) then
     call alloc_ompgpu_fourwf(dtset%ngfft,dtset%bandpp)
   end if

 end if
!=== Nonlocal Hamiltonian ===
 if (option==1.or.option==2) then
!  Compute the max of the total number of plane waves
   npw_max_nonloc=0
   if(mpi_enreg%paral_kgb==1) then
     npwarr_tmp=npwarr
     call xmpi_sum(npwarr_tmp,mpi_enreg%comm_band,ierr)
     npw_max_nonloc=maxval(npwarr_tmp)
   else
     npw_max_nonloc=dtset%mpw
   end if
!  Initialize all gpu data needed in nonlop
   dimffnl_max=4
!  if (abs(dtset%berryopt) == 5) dimffnl_max=4
   dimekb1_max=psps%dimekb
   if (dtset%usepaw==0) dimekb2_max=psps%ntypat
   if (dtset%usepaw==1) dimekb2_max=dtset%natom

   if (gpu_option == ABI_GPU_KOKKOS .or. gpu_option == ABI_GPU_LEGACY) then

#if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
     ! TODO (PK) : disable this allocation when Kokkos is available
     ! to save memory on GPU side
     call alloc_nonlop_gpu(npw_max_nonloc, &
       &                   npw_max_nonloc,&
       &                   dtset%nspinor,&
       &                   dtset%natom,&
       &                   dtset%ntypat,&
       &                   psps%lmnmax,&
       &                   psps%indlmn,&
       &                   nattyp,&
       &                   atindx1,&
       &                   gprimd,&
       &                   dimffnl_max,&
       &                   dimffnl_max,&
       &                   dimekb1_max,&
       &                   dimekb2_max)

     if (dtset%use_gemm_nonlop == 1) then
       call alloc_nonlop_gpu_data(dtset, &
         &                      psps, &
         &                      npw_max_nonloc,&
         &                      npw_max_nonloc,&
         &                      atindx1,&
         &                      nattyp,&
         &                      gpu_option)
     end if
#endif

   end if

 end if ! option 1 or 2

 call xmpi_barrier(mpi_enreg%comm_cell)

#else

 ABI_UNUSED(npwarr)
 ABI_UNUSED_A(psps)
 ! Dead branch: references the remaining dummy arguments so compilers
 ! do not warn that they are unused when GPU support is disabled
 if (.false.) then
   write(std_out,*) atindx1(1),dtset%natom,gprimd(1,1),mpi_enreg%me,nattyp(1),option
 end if

#endif

end subroutine alloc_hamilt_gpu
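
A remark on the npw_max computation above: when paral_kgb=1, each band-FFT
process knows only its share of the plane waves for a given k point, so
npwarr is first summed over the relevant communicator to rebuild the
per-k-point totals before the maximum is taken. The same pattern in
isolation (a sketch; nkpt, npwarr and comm stand for any array size,
local counts and communicator):

   integer :: npwarr_tmp(nkpt), npw_max, ierr
   npwarr_tmp = npwarr                     ! local plane-wave counts per k point
   call xmpi_sum(npwarr_tmp, comm, ierr)   ! rebuild the full counts
   npw_max = maxval(npwarr_tmp)            ! largest total over all k points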

ABINIT/alloc_nonlop_gpu_data [ Functions ]


NAME

 alloc_nonlop_gpu_data

FUNCTION

 allocate on a GPU device the memory pieces used by the gemm version of the
 nonlocal operator (stored as members of gemm_nonlop_gpu_data)

INPUTS

  atindx1(natom)=index table for atoms, inverse of atindx (see gstate.f)
  dtset <type(dataset_type)>=all input variables for this dataset
  nattyp(ntypat)= # atoms of each type.
  npwin is the number of plane waves for vectin
  npwout is the number of plane waves for vectout
  psps <type(pseudopotential_type)>=variables related to pseudopotentials
  gpu_option= GPU implementation to use, e.g. CUDA, OpenMP, ... (0=not using GPU)

OUTPUT

  (no output - only allocation on GPU, members of gemm_nonlop_gpu_data)
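
 To give a feel for the device footprint: the projections buffer allocated
 below holds cplex*nprojs*nspinor*bandpp double-precision words. An
 illustrative estimate with made-up values:

   ! Hypothetical numbers, for illustration only:
   ! cplex=2, nprojs=2000, nspinor=1, bandpp=4, dp=8 bytes
   ! size = 2 * 2000 * 1 * 4 * 8 = 128 000 bytes (~0.13 MB for this buffer)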

SOURCE

subroutine alloc_nonlop_gpu_data(dtset,&
  &                            psps,&
  &                            npwin,&
  &                            npwout,&
  &                            atindx1,&
  &                            nattyp,&
  &                            gpu_option)

  !Arguments ------------------------------------
  !scalars
  type(dataset_type),        intent(in) :: dtset
  type(pseudopotential_type),intent(in) :: psps
  integer,                   intent(in) :: npwin
  integer,                   intent(in) :: npwout
  integer,                   intent(in) :: atindx1(dtset%natom)
  !arrays
  integer,                   intent(in) :: nattyp(dtset%ntypat)
  !integer,                   intent(in) :: npwarr(dtset%nkpt)

  ! other
  integer,                   intent(in) :: gpu_option

  !Local variables-------------------------------
  !scalars
#if defined HAVE_GPU_CUDA
  integer :: cplex
  integer :: nprojs
  integer :: itypat

  real(dp) :: allocated_size_bytes
  character(len=500)    :: message
#endif

! *************************************************************************

#if defined HAVE_GPU_CUDA
  allocated_size_bytes = 0.

  cplex=2 !if (istwf_k>1) cplex=1

  ! compute nprojs: for each atom type, count the (l,m,n) projector
  ! channels flagged in indlmn, weighted by the number of atoms of that type
  nprojs = 0
  do itypat = 1,dtset%ntypat
    nprojs = nprojs + count(psps%indlmn(3,:,itypat)>0) * nattyp(itypat)
  end do


  !! allocate memory on device

  if (gemm_nonlop_gpu_data % allocated .eqv. .false.) then
    ! These will store the non-local factors for vectin, svectout and vectout respectively
    ABI_MALLOC_CUDA(gemm_nonlop_gpu_data%    projections_gpu, INT(cplex, c_size_t) * nprojs * dtset%nspinor*dtset%bandpp * dp)
    ABI_MALLOC_CUDA(gemm_nonlop_gpu_data%  s_projections_gpu, INT(cplex, c_size_t) * nprojs * dtset%nspinor*dtset%bandpp * dp)
    ABI_MALLOC_CUDA(gemm_nonlop_gpu_data%vnl_projections_gpu, INT(2    , c_size_t) * nprojs * dtset%nspinor*dtset%bandpp * dp)

    allocated_size_bytes = allocated_size_bytes + (2*cplex+2)*nprojs * dtset%nspinor*dtset%bandpp * dp

    ABI_MALLOC_CUDA(gemm_nonlop_gpu_data%  vectin_gpu,  INT(2, c_size_t) * npwin  * dtset%nspinor*dtset%bandpp * dp)
    ABI_MALLOC_CUDA(gemm_nonlop_gpu_data%  vectout_gpu, INT(2, c_size_t) * npwout * dtset%nspinor*dtset%bandpp * dp)
    ABI_MALLOC_CUDA(gemm_nonlop_gpu_data% svectout_gpu, INT(2, c_size_t) * npwout * dtset%nspinor*dtset%bandpp * dp)

    allocated_size_bytes = allocated_size_bytes + &
      & 2 * (npwin+2*npwout) * dtset%nspinor * dtset%bandpp * dp

    gemm_nonlop_gpu_data % allocated = .true.

    write(message,*) '  alloc_nonlop_gpu_data allocated ', allocated_size_bytes*1e-6, ' MBytes on device memory'
    call wrtout(std_out,message,'COLL')

  end if

#else

  ! Dead branch: references the dummy arguments so compilers do not
  ! warn that they are unused when CUDA support is disabled
  if (.false.) then
    write(std_out,*) psps%indlmn(1,1,1),dtset%natom,npwin,npwout,atindx1(1),nattyp(1),gpu_option
  end if

#endif

end subroutine alloc_nonlop_gpu_data

ABINIT/dealloc_hamilt_gpu [ Functions ]


NAME

 dealloc_hamilt_gpu

FUNCTION

 deallocate the memory pieces allocated on a GPU device for the application
 of the Hamiltonian on the GPU

INPUTS

  option=0: deallocate data for local operator (FFT)
         1: deallocate data for nonlocal operator
         2: deallocate both
  gpu_option= GPU implementation to use, e.g. CUDA, OpenMP, ... (0=not using GPU)

OUTPUT

  (no output - only deallocation on GPU)

SOURCE

subroutine dealloc_hamilt_gpu(option,gpu_option)

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: option,gpu_option
!arrays

!Local variables-------------------------------

! *************************************************************************

 if (gpu_option==ABI_GPU_DISABLED) return

 if (gpu_option == ABI_GPU_KOKKOS .or. gpu_option == ABI_GPU_LEGACY) then
#if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
   if (option==0.or.option==2) then
     if (gpu_option == ABI_GPU_LEGACY) then
       call free_gpu_fourwf()
     else if(gpu_option == ABI_GPU_KOKKOS) then
       call free_gpu_fourwf_managed()
     end if
   end if

   if (option==1.or.option==2) then
     call free_nonlop_gpu()
     call dealloc_nonlop_gpu_data()
   end if ! option 1 or 2
#endif
 else if(gpu_option == ABI_GPU_OPENMP) then
   call free_ompgpu_fourwf()
 end if

 ! Dead branch: keeps 'option' referenced when the CUDA path is
 ! preprocessed out, avoiding unused-argument warnings
 if (.false.) then
   write(std_out,*) option
 end if

end subroutine dealloc_hamilt_gpu

ABINIT/dealloc_nonlop_gpu_data [ Functions ]


NAME

 dealloc_nonlop_gpu_data

FUNCTION

 deallocate the memory pieces allocated on a GPU device for the application
 of the nonlocal (nonlop) operator by the Kokkos implementation

OUTPUT

  (no output - only deallocation on GPU)
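
 The routine below is safe to call repeatedly: the buffers are freed only
 when the allocated flag of gemm_nonlop_gpu_data is set, and the flag is
 cleared afterwards. A minimal sketch of the same guard pattern (the type,
 field names and free_device wrapper are hypothetical; assumes
 "use, intrinsic :: iso_c_binding" in the host module):

   type :: gpu_buffers_t
     logical     :: allocated = .false.
     type(c_ptr) :: buf       = c_null_ptr
   end type gpu_buffers_t

   subroutine free_buffers(d)
     type(gpu_buffers_t), intent(inout) :: d
     if (d%allocated) then
       call free_device(d%buf)   ! hypothetical wrapper around cudaFree
       d%buf       = c_null_ptr
       d%allocated = .false.     ! a second call is then a no-op
     end if
   end subroutine free_buffers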

SOURCE

subroutine dealloc_nonlop_gpu_data()

#if defined HAVE_GPU_CUDA
  if (gemm_nonlop_gpu_data % allocated) then
    ABI_FREE_CUDA(gemm_nonlop_gpu_data%    projections_gpu)
    ABI_FREE_CUDA(gemm_nonlop_gpu_data%  s_projections_gpu)
    ABI_FREE_CUDA(gemm_nonlop_gpu_data%vnl_projections_gpu)

    ABI_FREE_CUDA(gemm_nonlop_gpu_data% vectin_gpu)
    ABI_FREE_CUDA(gemm_nonlop_gpu_data% vectout_gpu)
    ABI_FREE_CUDA(gemm_nonlop_gpu_data%svectout_gpu)

    gemm_nonlop_gpu_data % allocated = .false.
  end if
#endif

end subroutine dealloc_nonlop_gpu_data

ABINIT/m_alloc_hamilt_gpu [ Modules ]


NAME

  m_alloc_hamilt_gpu

FUNCTION

 management of the GPU memory needed for the application of the Hamiltonian:
 gathers the routines alloc_hamilt_gpu, alloc_nonlop_gpu_data,
 dealloc_hamilt_gpu and dealloc_nonlop_gpu_data

COPYRIGHT

  Copyright (C) 2000-2024 ABINIT group (MT, FDahm)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

module m_alloc_hamilt_gpu

 use defs_basis
 use m_abicore
 use m_xmpi
 use m_dtset
 use m_ompgpu_fourwf
#if defined HAVE_GPU
 use m_gpu_toolbox
#endif

#ifdef HAVE_FC_ISO_C_BINDING
 use, intrinsic :: iso_c_binding
#endif

 use defs_datatypes, only : pseudopotential_type
 use defs_abitypes, only : MPI_type

 implicit none

 private