ABINIT/m_chebfiwf [ Functions ]

NAME

 m_chebfiwf

FUNCTION

 This module contains a routine that updates the whole set of wave functions
 at a given k-point and spin polarization, using the Chebyshev filtering
 method (2021 implementation based on the xG abstraction layer) with a fixed
 Hamiltonian. It may also simply compute eigenvectors and eigenvalues at this
 k-point, and it updates the matrix elements of the Hamiltonian.

COPYRIGHT

 Copyright (C) 2018-2024 ABINIT group (BS)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

! nvtx related macro definition
#include "nvtx_macros.h"

module m_chebfiwf

 use defs_abitypes
 use defs_basis
 use m_abicore
 use m_errors
 use m_fstrings
 use m_time

 use m_chebfi
 use m_chebfi2
 use m_invovl

 use m_cgtools,     only : dotprod_g
 use m_dtset,       only : dataset_type

 use m_hamiltonian, only : gs_hamiltonian_type
 use m_nonlop,      only : nonlop
 use m_prep_kgb,    only : prep_getghc, prep_nonlop
 use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free
 use m_getghc,      only : multithreaded_getghc
 use m_gemm_nonlop, only : gemm_nonlop_use_gemm

 use m_xg
 use m_xgTransposer
 !FIXME Keep those in these modules or move them together?
 use m_invovl,             only : invovl_ompgpu_static_mem,invovl_ompgpu_work_mem
 use m_gemm_nonlop_ompgpu, only : gemm_nonlop_ompgpu_static_mem
 use m_getghc_ompgpu,      only : getghc_ompgpu_work_mem

#if defined(HAVE_GPU)
 use m_gpu_toolbox
#endif

#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
 use m_nvtx_data
#endif

#if defined(HAVE_YAKL)
 use gator_mod
#endif

 use, intrinsic :: iso_c_binding, only: c_associated,c_loc,c_ptr,c_f_pointer,c_double,c_size_t

 use m_xmpi
 use m_xomp
#ifdef HAVE_OPENMP
 use omp_lib
#endif

 implicit none

 private

 integer, parameter :: l_tim_getghc=7
 real(dp), parameter :: inv_sqrt2 = 1/sqrt2

! For use in getghc_gsc1
 integer, save :: l_cpopt
 integer, save :: l_icplx
 integer, save :: l_istwf
 integer, save :: l_npw
 integer, save :: l_nband_filter
 integer, save :: l_nspinor
 logical, save :: l_paw
 integer, save :: l_prtvol
 integer, save :: l_sij_opt
 integer, save :: l_paral_kgb
 integer, save :: l_useria
 integer, save :: l_block_sliced

#if defined HAVE_GPU && defined HAVE_YAKL
 real(kind=c_double), ABI_CONTIGUOUS pointer, save :: l_pcon(:)
#else
 real(dp),            allocatable,            save :: l_pcon(:)
#endif

 type(mpi_type),pointer,save :: l_mpi_enreg
 type(gs_hamiltonian_type),pointer,save :: l_gs_hamk

 integer, parameter :: DEBUG_ROWS = 5
 integer, parameter :: DEBUG_COLUMNS = 5

 public :: chebfiwf2_blocksize
 public :: chebfiwf2

 CONTAINS  !========================================================================================

m_chebfiwf/chebfiwf2 [ Functions ]

NAME

 chebfiwf2

FUNCTION

 This routine updates the whole set of wave functions at a given k-point,
 using the Chebyshev filtering (chebfi) method (2021 version based on the xG
 abstraction layer).
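
 The filter itself relies on the three-term Chebyshev recurrence
 T_{k+1}(x) = 2x T_k(x) - T_{k-1}(x), applied to the Hamiltonian mapped
 affinely onto the damping window [lb,ub]: spectral components inside the
 window stay bounded in magnitude while the wanted low-energy components
 grow rapidly. Below is a minimal illustrative sketch on a diagonal model
 Hamiltonian (not ABINIT code; nline, lb and ub merely mimic dtset%nline and
 the filter bounds chosen by the real algorithm):

 program cheb_filter_demo
   implicit none
   integer, parameter :: dp = kind(1.d0), n = 200, nline = 8
   real(dp), parameter :: lb = 10._dp, ub = 200._dp  ! part of the spectrum to damp
   real(dp) :: h(n), x(n), y0(n), y1(n), y2(n), c, e
   integer :: i, k
   h = [(real(i, dp), i = 1, n)]        ! diagonal model Hamiltonian, eigenvalues 1..n
   call random_number(x); x = x - 0.5_dp
   c = (ub + lb)/2; e = (ub - lb)/2     ! affine map t = (H - c)/e sends [lb,ub] to [-1,1]
   ! Three-term Chebyshev recurrence y_{k+1} = 2 t y_k - y_{k-1}
   y0 = x
   y1 = (h*x - c*x)/e
   do k = 2, nline
     y2 = 2*(h*y1 - c*y1)/e - y0
     y0 = y1; y1 = y2
   end do
   ! The filtered vector concentrates on the low end of the spectrum
   write(*,'(a,f8.2)') 'Rayleigh quotient before filter: ', &
     dot_product(x, h*x)/dot_product(x, x)
   write(*,'(a,f8.2)') 'Rayleigh quotient after  filter: ', &
     dot_product(y1, h*y1)/dot_product(y1, y1)
 end program cheb_filter_demo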

INPUTS

  dtset= input variables for this dataset
  kinpw(npw)= kinetic energy for each plane wave (Hartree)
  mpi_enreg= MPI-parallelisation information
  nband= number of bands at this k point
  npw= number of plane waves at this k point
  nspinor= number of spinorial components of the wavefunctions
  prtvol= controls print volume and debugging output

OUTPUT

  eig(nband)= eigenvalues (Hartree) for all bands
  enl_out(nband)= contribution of each band to the nl part of energy
  resid(nband)= residuals for each band

SIDE EFFECTS

  cg(2,npw*nspinor*nband)= planewave coefficients of wavefunctions
  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k

SOURCE

subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,&
&                    nband,npw,nspinor,prtvol,resid)

 implicit none

 ! Arguments ------------------------------------
 integer,intent(in) :: nband,npw,prtvol,nspinor
 type(mpi_type),target,intent(in) :: mpi_enreg
 real(dp),target,intent(inout) :: cg(2,npw*nspinor*nband)
 real(dp),intent(in) :: kinpw(npw)
 real(dp),target,intent(out) :: resid(nband)
 real(dp),intent(out) :: enl_out(nband)
 real(dp),target,intent(out) :: eig(nband)
 type(dataset_type),intent(in) :: dtset
 type(gs_hamiltonian_type),target,intent(inout) :: gs_hamk

 ! Local variables-------------------------------
 ! scalars
 integer, parameter :: tim_chebfiwf2 = 1750
 integer :: ipw,space,blockdim,nline,total_spacedim,ierr
 real(dp) :: localmem
 type(c_ptr) :: cptr
 type(chebfi_t) :: chebfi
 type(xgBlock_t) :: xgx0,xgeigen,xgresidu
 ! arrays
 real(dp) :: tsec(2),chebfiMem(2)
 real(dp),pointer :: eig_ptr(:,:) => NULL()
 real(dp),pointer :: resid_ptr(:,:) => NULL()
 real(dp), allocatable :: l_gvnlxc(:,:)

 ! Dummy arguments needed by nonlop in the NC case
 integer,parameter :: choice=1, paw_opt=0, signs=1
 real(dp) :: gsc_dummy(1,1)
 type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)

#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
 integer(kind=c_size_t) :: l_pcon_size_bytes
#endif

! *********************************************************************

!################ INITIALIZATION  #####################################
!######################################################################

 call timab(tim_chebfiwf2,1,tsec)

!Set module variables
 l_paw = (gs_hamk%usepaw==1)
 l_cpopt=-1;l_sij_opt=0;if (l_paw) l_sij_opt=1
 l_istwf=gs_hamk%istwf_k
 l_npw = npw
 l_nspinor = nspinor
 l_prtvol = prtvol
 l_mpi_enreg => mpi_enreg
 l_gs_hamk => gs_hamk
 l_nband_filter = nband
 l_paral_kgb = dtset%paral_kgb
 l_block_sliced = dtset%invovl_blksliced

!Variables
 nline=dtset%nline
 blockdim=l_mpi_enreg%nproc_band*l_mpi_enreg%bandpp
 !for debug
 l_useria=dtset%useria

!Depends on istwfk
 if ( l_istwf == 2 ) then ! Real only
   ! SPACE_CR means that we have complex numbers but only re*re and im*im
   ! terms (no re*im terms), so that a vector of complex numbers is treated
   ! as a long vector of reals; the number of data is therefore
   ! (2*npw*nspinor)*nband.
   ! This space is completely equivalent to SPACE_R but will correctly set and
   ! get the array data into the xgBlock
   space = SPACE_CR
   l_icplx = 2
 else ! complex
   space = SPACE_C
   l_icplx = 1
 end if
323 
324 !Memory info
325  if ( prtvol >= 3 ) then
326    if (l_mpi_enreg%paral_kgb == 1) then
327      total_spacedim = l_icplx*l_npw*l_nspinor
328      call xmpi_sum(total_spacedim,l_mpi_enreg%comm_bandspinorfft,ierr)
329    else
330      total_spacedim = 0
331    end if
332    chebfiMem = chebfi_memInfo(nband,l_icplx*l_npw*l_nspinor,space,l_mpi_enreg%paral_kgb, &
333 &                             total_spacedim,l_mpi_enreg%bandpp) !blockdim
334    localMem = (l_npw+2*l_npw*l_nspinor+2*nband)*kind(1.d0) !blockdim
335    write(std_out,'(1x,A,F10.6,1x,A)') "Each MPI process calling chebfi should need around ", &
336    (localMem+sum(chebfiMem))/1e9,"GB of peak memory as follows :"
337    write(std_out,'(4x,A,F10.6,1x,A)') "Permanent memory in chebfiwf : ",(localMem)/1e9,"GB"
338    write(std_out,'(4x,A,F10.6,1x,A)') "Permanent memory in m_chebfi : ",(chebfiMem(1))/1e9,"GB"
339    write(std_out,'(4x,A,F10.6,1x,A)') "Temporary memory in m_chebfi : ",(chebfiMem(2))/1e9,"GB"
340  end if
341 
342  !For preconditionning
343  if(dtset%gpu_option==ABI_GPU_KOKKOS) then
344 #if defined HAVE_GPU && defined HAVE_YAKL
345    ABI_MALLOC_MANAGED(l_pcon, (/l_icplx*npw/))
346 #endif
347  else
348    ABI_MALLOC(l_pcon,(1:l_icplx*npw))
349  end if
350 
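 ! The expression below is the Teter-Payne-Allan polynomial preconditioner,
 ! f(x) = (27+18x+12x^2+8x^3)/(27+18x+12x^2+8x^3+16x^4) with x = Ekin(G):
 ! close to 1 at small kinetic energy, decaying as 1/(2x) at large kinetic
 ! energy (Teter, Payne and Allan, Phys. Rev. B 40, 12255 (1989)).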
!$omp parallel do schedule(static), shared(l_pcon,kinpw)
 do ipw=0,l_icplx*npw-1
   if(kinpw(ipw/l_icplx+1)>huge(zero)*1.d-11) then
     l_pcon(ipw+1)=0.d0
   else
     l_pcon(ipw+1) = (27+kinpw(ipw/l_icplx+1)*(18+kinpw(ipw/l_icplx+1)*(12+8*kinpw(ipw/l_icplx+1)))) &
&    / (27+kinpw(ipw/l_icplx+1)*(18+kinpw(ipw/l_icplx+1)*(12+8*kinpw(ipw/l_icplx+1))) + 16*kinpw(ipw/l_icplx+1)**4)
   end if
 end do

#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
 if(l_gs_hamk%gpu_option==ABI_GPU_KOKKOS) then
   ! upload l_pcon to device / gpu
   l_pcon_size_bytes = l_icplx * npw * dp
   call gpu_data_prefetch_async(C_LOC(l_pcon), l_pcon_size_bytes)
 end if
#endif

 call xgBlock_map(xgx0,cg,space,l_icplx*l_npw*l_nspinor,nband,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option)

#ifdef HAVE_OPENMP_OFFLOAD
 !$OMP TARGET ENTER DATA MAP(to:cg,eig,resid) IF(gs_hamk%gpu_option==ABI_GPU_OPENMP)
#endif

 ABI_NVTX_START_RANGE(NVTX_CHEBFI2_SQRT2)
 if ( l_istwf == 2 ) then ! Real only
   ! Scale cg
   call xgBlock_scale(xgx0,sqrt2,1)  !ALL MPI processes do this

   ! This is possible since the memory in cg and xgx0 is the same
   ! Don't know yet how to deal with this with xgBlock
   !MPI HANDLES THIS AUTOMATICALLY (only proc 0 is me_g0)
   if(l_mpi_enreg%me_g0 == 1) then
     if(gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       !$OMP TARGET MAP(to:cg)
       cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * inv_sqrt2
       !$OMP END TARGET
#endif
     else
       cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * inv_sqrt2
     end if
   end if
 end if
 ABI_NVTX_END_RANGE()

!Use a C pointer trick to change the rank of the array from (:) to (:,:)
 cptr = c_loc(eig)
 call c_f_pointer(cptr,eig_ptr,(/ nband,1 /))
 call xgBlock_map(xgeigen,eig_ptr,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option)
!Same C pointer trick to change the rank of the array from (:) to (:,:)
 cptr = c_loc(resid)
 call c_f_pointer(cptr,resid_ptr,(/ nband,1 /))
 call xgBlock_map(xgresidu,resid_ptr,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option)
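!Note: with Fortran 2008 pointer rank remapping, the same rank change could be
! written without iso_c_binding (eig and resid are contiguous targets), e.g.
!   eig_ptr(1:nband,1:1) => eig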

! ABI_MALLOC(l_gvnlxc,(2,l_npw*l_nspinor*l_nband_filter))
 call timab(tim_chebfiwf2,2,tsec)

 ABI_NVTX_START_RANGE(NVTX_CHEBFI2_INIT)
 call chebfi_init(chebfi,nband,l_icplx*l_npw*l_nspinor,dtset%tolwfr_diago,dtset%ecut, &
&                 dtset%paral_kgb,l_mpi_enreg%bandpp, &
&                 nline, space,1,l_gs_hamk%istwf_k, &
&                 l_mpi_enreg%comm_bandspinorfft,l_mpi_enreg%me_g0,l_paw,&
&                 l_mpi_enreg%comm_spinorfft,l_mpi_enreg%comm_band,&
&                 l_gs_hamk%gpu_option,gpu_kokkos_nthrd=dtset%gpu_kokkos_nthrd)
 ABI_NVTX_END_RANGE()

!################    RUUUUUUUN    #####################################
!######################################################################

 call chebfi_run(chebfi,xgx0,getghc_gsc1,getBm1X,precond1,xgeigen,xgresidu,nspinor)

!Free the preconditioner since it is not needed anymore
 if(dtset%gpu_option==ABI_GPU_KOKKOS) then
#if defined HAVE_GPU && defined HAVE_YAKL
   ABI_FREE_MANAGED(l_pcon)
#endif
 else
   ABI_FREE(l_pcon)
 end if

!Compute enl_out (nonlocal energy for each band) if necessary. This is the
!  best quick and dirty trick to compute this part in NC; gvnlxc cannot be
!  part of the chebfi algorithm.
 if ( .not. l_paw ) then
   !Check l_gvnlxc size
   !if ( size(l_gvnlxc) < 2*nband*l_npw*l_nspinor ) then
   !if ( size(l_gvnlxc) /= 0 ) then
   !  ABI_FREE(l_gvnlxc)
#ifdef FC_CRAY
   ABI_MALLOC(l_gvnlxc,(1,1))
#else
   ABI_MALLOC(l_gvnlxc,(0,0))
#endif
   !end if

   ABI_NVTX_START_RANGE(NVTX_CHEBFI2_NONLOP)
   !Call nonlop
   call nonlop(choice,l_cpopt,cprj_dum,enl_out,l_gs_hamk,0,eig,mpi_enreg,nband,1,paw_opt,&
        &            signs,gsc_dummy,l_tim_getghc,cg,l_gvnlxc)
   ABI_NVTX_END_RANGE()
   ABI_FREE(l_gvnlxc)
 end if

!Free chebfi
 call chebfi_free(chebfi)

#ifdef HAVE_OPENMP_OFFLOAD
 !$OMP TARGET UPDATE FROM(cg,eig,resid) IF(gs_hamk%gpu_option==ABI_GPU_OPENMP)
 !$OMP TARGET EXIT DATA MAP(delete:cg,eig,resid) IF(gs_hamk%gpu_option==ABI_GPU_OPENMP)
#endif
!################    SORRY IT'S ALREADY FINISHED : )  #################
!######################################################################

 call timab(tim_chebfiwf2,2,tsec)

 DBG_EXIT("COLL")

end subroutine chebfiwf2
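
The sqrt2/inv_sqrt2 scalings above implement the usual istwfk=2 storage trick:
only half of the G-sphere is stored, and for such real wavefunctions
<u|v> = 2*Re(sum_G u*(G) v(G)) - u(G=0) v(G=0). Scaling every coefficient by
sqrt2 except the G=0 one (held by the me_g0 process) turns this into a plain
real dot product, which is what the SPACE_CR xgBlock operations compute. A
minimal standalone check of the identity (illustrative code, not part of
ABINIT):

 program istwf2_dot_demo
   implicit none
   integer, parameter :: dp = kind(1.d0), npw = 64
   complex(dp) :: u(npw), v(npw)
   real(dp) :: ur(2*npw), vr(2*npw), re(npw), im(npw), full
   call random_number(re); call random_number(im); u = cmplx(re, im, dp)
   call random_number(re); call random_number(im); v = cmplx(re, im, dp)
   u(1) = real(u(1), dp); v(1) = real(v(1), dp)   ! the G=0 coefficient is real
   ! Full-sphere dot product from half-sphere storage, using c(-G) = conjg(c(G))
   full = 2*real(dot_product(u, v), dp) - real(u(1), dp)*real(v(1), dp)
   ! Same value as a plain real dot product after the sqrt2 scaling,
   ! with the G=0 components scaled back by inv_sqrt2:
   ur(1:2*npw:2) = sqrt(2._dp)*real(u, dp); ur(2:2*npw:2) = sqrt(2._dp)*aimag(u)
   vr(1:2*npw:2) = sqrt(2._dp)*real(v, dp); vr(2:2*npw:2) = sqrt(2._dp)*aimag(v)
   ur(1:2) = ur(1:2)/sqrt(2._dp); vr(1:2) = vr(1:2)/sqrt(2._dp)
   write(*,*) full, dot_product(ur, vr)   ! the two numbers agree
 end program istwf2_dot_demo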

m_chebfiwf/getBm1X [ Functions ]

NAME

 getBm1X

FUNCTION

 This routine computes S^-1|C> for a given wave function C.
 It acts as a driver for apply_invovl.

SIDE EFFECTS

  X  <type(xgBlock_t)>= memory block containing |C>
  Bm1X <type(xgBlock_t)>= memory block containing S^-1|C>
  transposer <type(xgTransposer_t)>= data used for array transpositions

SOURCE

subroutine getBm1X(X,Bm1X,transposer)

 implicit none

!Arguments ------------------------------------
 type(xgBlock_t), intent(inout) :: X
 type(xgBlock_t), intent(inout) :: Bm1X
 type(xgTransposer_t), optional, intent(inout) :: transposer

!Local variables-------------------------------
!scalars
 integer :: blockdim
 integer :: spacedim
 integer :: cpuRow
!arrays
 real(dp), pointer :: ghc_filter(:,:)
 real(dp), pointer :: gsm1hc_filter(:,:)
 type(pawcprj_type), allocatable :: cwaveprj_next(:,:) !dummy

! *********************************************************************

 call xgBlock_getSize(X,spacedim,blockdim)

 spacedim = spacedim/l_icplx

 call xgBlock_reverseMap(X,ghc_filter,l_icplx,spacedim*blockdim)

 call xgBlock_reverseMap(Bm1X,gsm1hc_filter,l_icplx,spacedim*blockdim)

 if (l_paral_kgb == 1) cpuRow = xgTransposer_getRank(transposer, 2)

 !Scale back cg
 if(l_istwf == 2) then
   call xgBlock_scale(X,inv_sqrt2,1)
   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
     else
       if (cpuRow == 0) then
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
       end if
     end if
   end if

   if(l_paw) then
     call xgBlock_scale(Bm1X,inv_sqrt2,1)
     if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
           !$OMP END TARGET
         end if
       else
         if (cpuRow == 0) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
           !$OMP END TARGET
         end if
       end if
#endif
     else
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) &
  &        gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
       else
         if (cpuRow == 0) then
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
         end if
       end if
     end if
   end if
 end if

 if(l_paw) then
   !cwaveprj_next is dummy
   if(gemm_nonlop_use_gemm) then
     ABI_MALLOC(cwaveprj_next, (1,1))
   else
     ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
     call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
   end if

   ABI_NVTX_START_RANGE(NVTX_INVOVL)
   call apply_invovl(l_gs_hamk, ghc_filter(:,:), gsm1hc_filter(:,:), cwaveprj_next(:,:), &
       spacedim/l_nspinor, blockdim, l_mpi_enreg, l_nspinor, l_block_sliced)
   ABI_NVTX_END_RANGE()
 else
   gsm1hc_filter(:,:) = ghc_filter(:,:)
 end if

 ABI_NVTX_START_RANGE(NVTX_INVOVL_POST1)
 !Scale cg, ghc, gsc
 if ( l_istwf == 2 ) then
   call xgBlock_scale(X,sqrt2,1)
   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       end if
     else
       if (cpuRow == 0) then
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
       end if
     end if
   end if

   if(l_paw) then
     call xgBlock_scale(Bm1X,sqrt2,1)
     if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
           !$OMP END TARGET
         end if
       else
         if (cpuRow == 0) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
           !$OMP END TARGET
         end if
       end if
#endif
     else
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) &
  &        gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       else
         if (cpuRow == 0) then
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         end if
       end if
     end if
   end if
 end if

 if (l_paw) then
   call pawcprj_free(cwaveprj_next)
   ABI_FREE(cwaveprj_next)
 end if

 ABI_NVTX_END_RANGE()

end subroutine getBm1X
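
For PAW, the overlap operator has the low-rank structure S = 1 + sum_ij
|p_i> s_ij <p_j|, so apply_invovl can apply S^-1 through a Woodbury-type
formula instead of an iterative solve (see A. Levitt and M. Torrent,
Comput. Phys. Commun. 187, 98 (2015), the paper behind this chebfi/invovl
implementation). A minimal sketch of the idea for a single real projector,
where Woodbury reduces to Sherman-Morrison (illustrative code, not the
actual apply_invovl algorithm):

 program invovl_woodbury_demo
   implicit none
   integer, parameter :: dp = kind(1.d0), n = 50
   real(dp), parameter :: s_coef = 0.7_dp          ! overlap strength s
   real(dp) :: p(n), x(n), sx(n), y(n), denom
   call random_number(p); call random_number(x)
   sx = x + s_coef*p*dot_product(p, x)             ! apply S = 1 + s |p><p|
   ! Sherman-Morrison: S^-1 = 1 - |p><p| * s/(1 + s <p|p>)
   denom = 1 + s_coef*dot_product(p, p)
   y = sx - p*(s_coef/denom)*dot_product(p, sx)    ! apply S^-1 to S x
   write(*,*) 'max |S^-1 S x - x| =', maxval(abs(y - x))  ! ~ machine epsilon
 end program invovl_woodbury_demo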

m_chebfiwf/getghc_gsc1 [ Functions ]

NAME

 getghc_gsc1

FUNCTION

 This routine computes H|C> and possibly S|C> for a given wave function C.
 It acts as a driver for getghc, taking into account parallelism, multithreading, etc.

SIDE EFFECTS

  X  <type(xgBlock_t)>= memory block containing |C>
  AX <type(xgBlock_t)>= memory block containing H|C>
  BX <type(xgBlock_t)>= memory block containing S|C>
  transposer <type(xgTransposer_t)>= data used for array transpositions

SOURCE

subroutine getghc_gsc1(X,AX,BX,transposer)

 implicit none

!Arguments ------------------------------------
 type(xgBlock_t), intent(inout) :: X
 type(xgBlock_t), intent(inout) :: AX
 type(xgBlock_t), intent(inout) :: BX
 type(xgTransposer_t), optional, intent(inout) :: transposer

!Local variables-------------------------------
!scalars
 integer :: blockdim
 integer :: spacedim
 integer :: cpuRow
 real(dp) :: eval
!arrays
 type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1)
 real(dp), pointer :: cg(:,:)
 real(dp), pointer :: ghc(:,:)
 real(dp), pointer :: gsc(:,:)
 real(dp)          :: l_gvnlxc(1,1)

! *********************************************************************

 call xgBlock_getSize(X,spacedim,blockdim)

 spacedim = spacedim/l_icplx

 call xgBlock_reverseMap(X,cg,l_icplx,spacedim*blockdim)
 call xgBlock_reverseMap(AX,ghc,l_icplx,spacedim*blockdim)
 call xgBlock_reverseMap(BX,gsc,l_icplx,spacedim*blockdim)

 !Scale back cg
 if (l_paral_kgb == 1) cpuRow = xgTransposer_getRank(transposer, 2)
 if(l_istwf == 2) then
   call xgBlock_scale(X,inv_sqrt2,1)
   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * sqrt2
     else
       if (cpuRow == 0) cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * sqrt2
     end if
   end if
 end if

 !if ( size(l_gvnlxc) < 2*blockdim*spacedim ) then
 !  ABI_FREE(l_gvnlxc)
 !  ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
 !end if

 call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
   l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)

#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
 call gpu_device_synchronize()
#endif

 !Scale cg, ghc, gsc
 if ( l_istwf == 2 ) then
   call xgBlock_scale(X ,sqrt2,1)
   call xgBlock_scale(AX,sqrt2,1)

   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         !$OMP END TARGET
         !$OMP TARGET MAP(ghc)
         ghc(:, 1:spacedim*blockdim:l_npw) = ghc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         !$OMP END TARGET
         !$OMP TARGET MAP(ghc)
         ghc(:, 1:spacedim*blockdim:spacedim) = ghc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         ghc(:, 1:spacedim*blockdim:l_npw) = ghc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       end if
     else
       if (cpuRow == 0) then
         cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         ghc(:, 1:spacedim*blockdim:spacedim) = ghc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
       end if
     end if
   end if
   if(l_paw) then
     call xgBlock_scale(BX,sqrt2,1)
     if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) then
           !$OMP TARGET MAP(gsc)
           gsc(:, 1:spacedim*blockdim:l_npw) = gsc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
           !$OMP END TARGET
         end if
       else
         if (cpuRow == 0) then
           !$OMP TARGET MAP(gsc)
           gsc(:, 1:spacedim*blockdim:spacedim) = gsc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
           !$OMP END TARGET
         end if
       end if
#endif
     else
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) gsc(:, 1:spacedim*blockdim:l_npw) = gsc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       else
         if (cpuRow == 0) gsc(:, 1:spacedim*blockdim:spacedim) = gsc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
       end if
     end if
   end if ! l_paw
 end if ! l_istwf==2

 if ( .not. l_paw ) call xgBlock_copy(X,BX)

end subroutine getghc_gsc1
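
In the strided assignments above, the slice cg(:, 1:spacedim*blockdim:l_npw)
touches exactly one column per band/spinor block: with the (2, ...) reverse
mapping, a stride of l_npw selects the G=0 (me_g0) plane-wave coefficient of
every band, the only coefficient that must not carry the sqrt2 factor. A tiny
illustration of which columns such a stride picks (illustrative code only):

 program stride_slice_demo
   implicit none
   integer, parameter :: npw = 4, nband = 3
   integer :: c(2, npw*nband), j
   c = 0
   c(:, 1:npw*nband:npw) = 1   ! the slice pattern used for the me_g0 scaling
   do j = 1, npw*nband
     if (c(1,j) == 1) write(*,'(a,i0,a,i0)') 'column ', j, ' is G=0 of band ', (j-1)/npw + 1
   end do
 end program stride_slice_demo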

m_chebfiwf/precond1 [ Functions ]

NAME

 precond1

FUNCTION

 This routine applies preconditioning to a block of memory

INPUTS

  (none) - uses the module-level l_pcon, l_npw, l_nspinor and l_icplx set up by chebfiwf2

SIDE EFFECTS

  W <type(xgBlock_t)>= memory block

SOURCE

subroutine precond1(W)

 implicit none

 ! Arguments ------------------------------------
 type(xgBlock_t), intent(inout)           :: W

 ! Local variables-------------------------------
 ! scalars
 integer :: ispinor

 ! *********************************************************************

 ! Precondition resid_vec
 do ispinor = 1,l_nspinor
   call xgBlock_colwiseMul(W, l_pcon, l_icplx*l_npw*(ispinor-1))
 end do

end subroutine precond1
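
The l_pcon coefficients applied here are built in chebfiwf2 from the
Teter-Payne-Allan polynomial. A standalone evaluation showing its limiting
behaviour, f(0) = 1 and f(x) ~ 1/(2x) at large kinetic energy (illustrative
code, not part of ABINIT):

 program tpa_precond_demo
   implicit none
   integer, parameter :: dp = kind(1.d0)
   real(dp) :: x, poly
   integer :: i
   do i = -1, 3
     x = 10.0_dp**i                        ! kinetic energies 0.1 ... 1000 Ha
     poly = 27 + x*(18 + x*(12 + 8*x))     ! same nested form as in chebfiwf2
     write(*,'(a,es9.2,a,es12.4)') 'Ekin =', x, '   f(Ekin) =', poly/(poly + 16*x**4)
   end do
 end program tpa_precond_demo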