TABLE OF CONTENTS

 ABINIT/m_gemm_nonlop_gpu [ Modules ]
 m_gemm_nonlop_gpu/destroy_gemm_nonlop_gpu [ Functions ]
 m_gemm_nonlop_gpu/gemm_nonlop_gpu [ Functions ]
 m_gemm_nonlop_gpu/init_gemm_nonlop_gpu [ Functions ]
 m_gemm_nonlop_gpu/make_gemm_nonlop_gpu [ Functions ]

ABINIT/m_gemm_nonlop_gpu [ Modules ]

NAME

 m_gemm_nonlop_gpu

FUNCTION

  This module provides functions to compute the nonlocal operator by means of the BLAS GEMM
  routine, offloaded to the GPU (CUDA legacy or Kokkos back-ends). By treating ndat wavefunctions
  simultaneously, it replaces many matrix-vector products by a few BLAS3 matrix-matrix calls,
  which leads to excellent efficiency and scalability.
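
 The batching idea can be illustrated with plain BLAS, outside of any ABINIT data type:
 stacking the ndat wavefunctions as columns of a matrix turns ndat matrix-vector products
 (ZGEMV) into a single matrix-matrix product (ZGEMM). A minimal self-contained sketch
 (array names and sizes are illustrative, not the module's actual layout):

    program gemm_batch_sketch
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: npw = 1000, nprojs = 36, ndat = 8
      complex(dp) :: projs(npw,nprojs)   ! projectors <g|p_i>, one per column
      complex(dp) :: psi(npw,ndat)       ! ndat wavefunctions, one per column
      complex(dp) :: cproj(nprojs,ndat)  ! projections <p_i|psi>, all ndat at once
      call fill(projs); call fill(psi)
      ! one BLAS3 call instead of ndat BLAS2 calls: cproj = projs^H * psi
      call zgemm('C', 'N', nprojs, ndat, npw, (1.0_dp, 0.0_dp), projs, npw, &
                 psi, npw, (0.0_dp, 0.0_dp), cproj, nprojs)
    contains
      subroutine fill(a)
        complex(dp), intent(out) :: a(:,:)
        real(dp) :: re(size(a,1),size(a,2)), im(size(a,1),size(a,2))
        call random_number(re); call random_number(im)
        a = cmplx(re, im, kind=dp)
      end subroutine fill
    end program gemm_batch_sketch

 The GPU path of this module performs the same products with abi_gpu_xgemm on device-resident
 buffers.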

COPYRIGHT

 Copyright (C) 2014-2024 ABINIT group (AL,MS)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

19 #if defined HAVE_CONFIG_H
20 #include "config.h"
21 #endif
22 
23 #include "abi_common.h"
24 
25 module m_gemm_nonlop_gpu
26 
27  use defs_basis
28  use m_errors
29  use m_abicore
30  use m_xmpi
31  use m_fstrings,    only : itoa, ftoa, sjoin
32 
33  use m_abi_linalg  ! copy_on_gpu, copy_from_gpu, alloc_on_gpu, dealloc_on_gpu, gpu_memset, gpu_allocated
34  use defs_abitypes, only : MPI_type
35  use m_opernlc_ylm_allwf_cpu, only : opernlc_ylm_allwf_cpu
36  use m_pawcprj, only : pawcprj_type
37  use m_gemm_nonlop, only : gemm_nonlop_type,gemm_nonlop_ikpt_this_proc_being_treated,make_gemm_nonlop,gemm_nonlop_kpt
38 
39 #if defined(HAVE_GPU_CUDA)
40  use m_gpu_toolbox
41  use m_alloc_hamilt_gpu, only : gemm_nonlop_gpu_data
42 #endif
43 
44 #ifdef HAVE_KOKKOS
45  use m_manage_kokkos, only : opernlc_ylm_allwf_kokkos
46 #endif
47 
48 #ifdef HAVE_FC_ISO_C_BINDING
49  use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_int64_t, c_float, c_double, c_size_t, c_loc
50 #endif
51 
52  implicit none
53 
54  private
55 
56  public :: init_gemm_nonlop_gpu
57  public :: destroy_gemm_nonlop_gpu
58  public :: make_gemm_nonlop_gpu
59  public :: gemm_nonlop_gpu
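
 The four public entry points are meant to be called in a fixed order: buffers are allocated
 per k-point, filled once, reused throughout the SCF cycle, and freed at the end. A schematic
 of that life cycle (argument lists elided; this is an illustrative sketch, not actual ABINIT
 driver code):

    ! illustrative call order only
    call init_gemm_nonlop_gpu(nkpt)          ! allocate gemm_nonlop_kpt_gpu(1:nkpt)
    do ikpt = 1, nkpt
      call make_gemm_nonlop_gpu(ikpt, ...)   ! build projectors and upload them to the GPU
    end do
    ! ... SCF iterations: gemm_nonlop_gpu(...) applies the nonlocal operator ...
    call destroy_gemm_nonlop_gpu(nkpt)       ! free the GPU buffers first ...
    ! ... and only then release the CPU-side gemm_nonlop_kpt (see destroy_gemm_nonlop_gpu below)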

m_gemm_nonlop_gpu/destroy_gemm_nonlop_gpu [ Functions ]

NAME

 destroy_gemm_nonlop_gpu

FUNCTION

 Deallocation of the GPU resources attached to the gemm_nonlop_kpt_gpu array

INPUTS

 nkpt= number of k-points

PARENTS

      m_gstate

CHILDREN

      dealloc_on_gpu

SOURCE

226  subroutine destroy_gemm_nonlop_gpu(nkpt)
227 
228    integer,intent(in) :: nkpt
229    integer :: ikpt
230 
231    ! *************************************************************************
232 
233    ! TODO add cycling if k-point parallelism is used
234 
235    ! This function must be called before destroy_gemm_nonlop so it can
236    ! properly figure out which GPU buffer to free.
237    if(.not. allocated(gemm_nonlop_kpt)) then
238      ABI_BUG("gemm_nonlop is already free, cannot free GPU resources !")
239    end if
240 
241    ! deallocate GPU resources for each k-point
242    do ikpt = 1,nkpt
243      if(gemm_nonlop_kpt_gpu(ikpt)%nprojs /= -1) then
244        ! deallocate arrays projs, projs_r and projs_i
245        if (allocated(gemm_nonlop_kpt(ikpt)%projs)) then
246          call dealloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs)
247        end if
248        if (allocated(gemm_nonlop_kpt(ikpt)%projs_r)) then
249          call dealloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_r)
250        end if
251        if (allocated(gemm_nonlop_kpt(ikpt)%projs_i)) then
252          call dealloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_i)
253        end if
254        gemm_nonlop_kpt_gpu(ikpt)%nprojs = -1
255      end if
256    end do
257 
258    ABI_FREE(gemm_nonlop_kpt_gpu)
259 
260  end subroutine destroy_gemm_nonlop_gpu

m_gemm_nonlop_gpu/gemm_nonlop_gpu [ Functions ]

NAME

 gemm_nonlop_gpu

FUNCTION

 GPU-accelerated replacement of nonlop: applies the nonlocal operator using GEMM calls on the device.

INPUTS

 [gpu_option] = GPU implementation to use, e.g. CUDA, OpenMP offload, ... (0=not using GPU)

PARENTS

      m_nonlop

CHILDREN

      abi_gpu_xgemm,abi_gpu_xaxpy,gpu_memset,gpu_xscal,copy_on_gpu,copy_from_gpu,
      opernlc_ylm_allwf_cpu,opernlc_ylm_allwf_kokkos

SOURCE

371  subroutine gemm_nonlop_gpu(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,&
372 &                        enl,indlmn,istwf_k,&
373 &                        lambda,lmnmax,matblk,&
374 &                        mpi_enreg,natom,nattyp,ndat,nkpgin,nkpgout,&
375 &                        nnlout,npwin,npwout,nspinor,nspinortot,ntypat,paw_opt,&
376 &                        sij,svectout,&
377 &                        useylm,vectin,vectout,&
378 &                        vectproj,gpu_option)
379 
380   !Arguments ------------------------------------
381   !scalars
382   integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout
383   integer,intent(in) :: istwf_k,lmnmax,matblk,natom,ndat,nkpgin
384   integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat
385   integer,intent(in) :: paw_opt,useylm
386   integer,optional,intent(in)      :: gpu_option
387   real(dp), target, intent(in)     :: lambda(ndat)
388   type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
389   type(MPI_type),   intent(in)     :: mpi_enreg
390 
391   !arrays
392   integer,  target, intent(in)     :: atindx1(natom)
393   integer,  target, intent(in)     :: indlmn(6,lmnmax,ntypat)
394   integer,          intent(in)     :: nattyp(ntypat)
395   real(dp), target, intent(in)     :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq)
396   real(dp), target, intent(in)     :: sij(dimenl1,ntypat*((paw_opt+1)/3))
397   real(dp), target, intent(inout)  ::  vectin (2,npwin*nspinor*ndat)
398   real(dp), target, intent(out)    :: svectout(2,npwout*nspinor*(paw_opt/3)*ndat)
399   real(dp), target, intent(inout)  ::  vectout(2,npwout*nspinor*ndat) !vz_i
400   real(dp), target, intent(inout), ABI_CONTIGUOUS optional :: vectproj(:,:,:)
401 
402   ! locals
403   integer :: idat, nprojs, shift, iatom, nlmn, ierr, ibeg, iend
404   integer :: cplex, cplex_enl, cplex_fac
405   integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn
406   integer :: cplex_dgxdt(1), cplex_d2gxdt(1)
407   real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1)
408   real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1)
409   integer :: npw_max
410   integer :: nattyp_max
411 
412   logical :: local_vectproj
413 
414   real(dp), allocatable, target :: sij_typ(:)
415 
416   !type(c_ptr)                      :: projections_gpu,        s_projections_gpu,        vnl_projections_gpu
417   real(dp), ABI_CONTIGUOUS pointer :: projections_cpu(:,:,:)
418   real(dp),    allocatable, target :: s_projections_cpu(:,:,:), vnl_projections_cpu(:,:,:)
419 
420   ! used inside opernlc_ylm_allwf_kokkos_cpp when iphase > 1
421   type(c_ptr)                      :: vnl_projections2_gpu
422 
423   type(c_ptr)                      :: temp_realvec_gpu
424 
425   ! GPU waveform data are allocated in m_alloc_hamilt_gpu
426   !type(c_ptr)                      :: vectin_gpu, vectout_gpu, svectout_gpu
427   ! Pointers to either CUDA allocated or managed memory data
428   type(c_ptr)                      :: vectin_ptr, vectout_ptr, svectout_ptr
429   integer(c_size_t)                :: vectin_size
430 
431   type(c_ptr)                      :: enl_gpu
432   integer(c_size_t)                :: enl_size_bytes
433 
434   integer                          :: sizeof_int
435 
436   type(c_ptr)                      :: atindx1_gpu
437   integer(c_size_t)                :: atindx1_size_bytes
438 
439   type(c_ptr)                      :: indlmn_gpu
440   integer(c_size_t)                :: indlmn_size_bytes
441 
442   type(c_ptr)                      :: lambda_gpu
443   integer(c_size_t)                :: lambda_size_bytes
444 
445   type(c_ptr)                      :: sij_typ_gpu
446   integer(c_size_t)                :: sij_typ_size_bytes
447 
448   integer(kind=c_int32_t), parameter :: izero = 0
449   integer(kind=c_int32_t), parameter :: fix_realvec_divide_by_2 = 0
450   integer(kind=c_int32_t), parameter :: fix_realvec_zero_out    = 1
451 
452   integer                          :: shift_spinor
453 
454 ! *************************************************************************
455 
456   ! This function should only be called within CUDA legacy or Kokkos code paths
457   if(gpu_option/=ABI_GPU_LEGACY .and. gpu_option/=ABI_GPU_KOKKOS) then
458     ABI_BUG("Unhandled GPU value !")
459   end if
460 
461   npw_max = MAX(npwin, npwout)
462 
463   cplex = 2; if (istwf_k>1) cplex=1
464   cplex_enl = 1; if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex?
465   cplex_fac = max(cplex,dimekbq)
466   if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex?
467 
468   nprojs = gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%nprojs
469 
470   ! These will store the non-local factors for vectin, svectout and vectout respectively
471   call gpu_memset(gemm_nonlop_gpu_data%    projections_gpu, izero, INT(cplex,     c_size_t) * nprojs * nspinor*ndat * dp)
472   call gpu_memset(gemm_nonlop_gpu_data%  s_projections_gpu, izero, INT(cplex,     c_size_t) * nprojs * nspinor*ndat * dp)
473   call gpu_memset(gemm_nonlop_gpu_data%vnl_projections_gpu, izero, INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp)
474 
475   if (dimekbq>1) then
476     ABI_CHECK(cplex_fac==2,"BUG: invalid cplex_fac==1 when dimekbq=2!")
477     ABI_MALLOC_CUDA(vnl_projections2_gpu,        INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp)
478     call gpu_memset(vnl_projections2_gpu, izero, INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp)
479   end if
480 
481   vectin_size = 2*npwin*nspinor*ndat*dp
482 
483   if(gpu_option==ABI_GPU_LEGACY) then
484     vectin_ptr  = gemm_nonlop_gpu_data%vectin_gpu
485     vectout_ptr = gemm_nonlop_gpu_data%vectout_gpu
486     svectout_ptr= gemm_nonlop_gpu_data%svectout_gpu
487 
488     call copy_on_gpu(vectin, gemm_nonlop_gpu_data%vectin_gpu, vectin_size)
489     if (choice == 7) then
490       call copy_on_gpu(svectout, gemm_nonlop_gpu_data%svectout_gpu, INT(2, c_size_t) * npwout*nspinor*ndat * dp)
491     end if
492 
493   else if(gpu_option==ABI_GPU_KOKKOS) then
494     vectin_ptr  =C_LOC(vectin)
495     vectout_ptr =C_LOC(vectout)
496     svectout_ptr=C_LOC(svectout)
497 
498     call gpu_data_prefetch_async(vectin_ptr, vectin_size)
499     if (choice == 7) then
500       call gpu_data_prefetch_async(C_LOC(svectout), vectin_size)
501     end if
502 
503   end if
504 
505   !! gpu alloc and init : enl_gpu
506   enl_size_bytes = dimenl1 * dimenl2 * nspinortot**2 * dimekbq * dp
507   ABI_MALLOC_CUDA( enl_gpu, enl_size_bytes )
508   call copy_on_gpu(enl, enl_gpu, enl_size_bytes )
509 
510   !! gpu alloc and init atindx1_gpu
511   sizeof_int = 4
512   atindx1_size_bytes = natom * sizeof_int
513   ABI_MALLOC_CUDA( atindx1_gpu,  atindx1_size_bytes )
514   call copy_on_gpu(atindx1, atindx1_gpu, atindx1_size_bytes )
515 
516   !! gpu alloc and init indlmn_gpu
517   indlmn_size_bytes = 6*lmnmax*ntypat * sizeof_int
518   ABI_MALLOC_CUDA( indlmn_gpu,  indlmn_size_bytes )
519   call copy_on_gpu(indlmn, indlmn_gpu, indlmn_size_bytes )
520 
521   !! gpu alloc and init lambda_gpu
522   lambda_size_bytes =  ndat * dp
523   ABI_MALLOC_CUDA( lambda_gpu, lambda_size_bytes )
524   call copy_on_gpu(lambda, lambda_gpu, lambda_size_bytes )
525 
526   if(nprojs == 0) then
527     ! TODO check if this is correct
528     vectout = zero
529     if(paw_opt>0) svectout = vectin
        ! free the device buffers allocated above so this early return does not leak them
        if (dimekbq>1) then
          ABI_FREE_CUDA(vnl_projections2_gpu)
        end if
        ABI_FREE_CUDA( enl_gpu )
        ABI_FREE_CUDA( atindx1_gpu )
        ABI_FREE_CUDA( indlmn_gpu )
        ABI_FREE_CUDA( lambda_gpu )
530     return
531   end if
532 
533   ! Determine ahead of time whether temp_realvec is needed,
534   ! so that it is allocated once at the beginning of the subroutine and
535   ! deallocated once at the end, avoiding repeated costly allocations/deallocations
536   if (cplex /= 2) then
537     if ( (cpopt < 2) .or. &
538       &  (paw_opt == 3 .or. paw_opt == 4) .or. &
539       &  (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)) then
540       ABI_MALLOC_CUDA(temp_realvec_gpu, (INT(npw_max, c_size_t) * nspinor * ndat * dp))
541     end if
542   end if
543 
544   ! If vectproj is provided, use it for further calculations; otherwise use a locally allocated array
545   local_vectproj=.false.
546   if(PRESENT(vectproj)) then
547     if(size(vectproj)>1) local_vectproj=.true.
548   end if
549   if (local_vectproj) then
550     projections_cpu => vectproj
551   else
552     ABI_MALLOC(    projections_cpu,(cplex,     nprojs,nspinor*ndat))
553     projections_cpu = zero
554   end if
555   ABI_MALLOC(  s_projections_cpu,(cplex,     nprojs,nspinor*ndat)) ! TODO - TO BE REMOVED ONCE CUDA-IZATION IS OK
556   ABI_MALLOC(vnl_projections_cpu,(cplex_fac, nprojs,nspinor*ndat)) ! TODO - TO BE REMOVED ONCE CUDA-IZATION IS OK
557   s_projections_cpu = zero
558   vnl_projections_cpu = zero
559 
560   if(cpopt >= 2) then
561 
562     if(.not. local_vectproj) then
563       !This use-case is extremely painful for GEMM GPU nonlop performance.
564       !The vectproj parameter should always be provided to avoid it
565 
566       ! retrieve from cprjin
567       do idat=1, ndat*nspinor
568         shift = 0
569         do iatom = 1, natom
570           nlmn = cprjin(iatom, idat)%nlmn
571           projections_cpu(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn)
572           shift = shift + nlmn
573         end do
574       end do
575     end if
576 
577     ! copy from HOST projections_cpu to GPU projections_gpu
578     call copy_on_gpu(projections_cpu, gemm_nonlop_gpu_data%projections_gpu,&
579         INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp)
580 
581   else ! cpopt < 2
582 
583     ! opernla
584     if(cplex == 2) then
585 
586       ! projections_gpu = projs * vectin_gpu
587       call abi_gpu_xgemm(cplex, 'C', 'N', &
588         &            nprojs, ndat*nspinor, npwin, &                                                ! M,N,K
589         &            cone, &                                                                       ! alpha
590         &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwin, & ! A, LDA
591         &            vectin_ptr, npwin, &                                                          ! B, LDB
592         &            czero, &                                                                      ! beta
593         &            gemm_nonlop_gpu_data%projections_gpu, nprojs)                                 ! C, LDC
594 
595     else
596 
597       if (.not. gpu_allocated(temp_realvec_gpu)) then
598         ABI_ERROR("Please provide memory allocation for temp_realvec_gpu array")
599       end if
600 
601 
602       ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i
603       !temp_realvec(1:npwin*nspinor*ndat) = vectin(1,1:npwin*nspinor*ndat)
604       call extract_real_part(temp_realvec_gpu, vectin_ptr, npwin*nspinor*ndat)
605 
606       if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then
607         ! do idat=1, ndat*nspinor
608         !   temp_realvec(1+(idat-1)*npwin) = temp_realvec(1+(idat-1)*npwin)/2
609         ! end do
610         call fix_realvec(temp_realvec_gpu, npwin, ndat*nspinor, fix_realvec_divide_by_2)
611       end if
612 
613       call abi_gpu_xgemm(cplex, 'T', 'N', &
614         &            nprojs, ndat*nspinor, npwin, &                                                  ! M,N,K
615         &            cone, &                                                                         ! alpha
616         &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwin, & ! A, LDA
617         &            temp_realvec_gpu, npwin, &                                                      ! B, LDB
618         &            czero, &                                                                        ! beta
619         &            gemm_nonlop_gpu_data%projections_gpu, nprojs)                                   ! C, LDC
620 
621       !temp_realvec(1:npwin*nspinor*ndat) = vectin(2,1:npwin*nspinor*ndat)
622       call extract_imag_part(temp_realvec_gpu, vectin_ptr, npwin*nspinor*ndat)
623 
624       if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then
625         ! do idat=1, ndat*nspinor
626         !   temp_realvec(1+(idat-1)*npwin) = zero
627         ! end do
628         call fix_realvec(temp_realvec_gpu, npwin, ndat*nspinor, fix_realvec_zero_out)
629       end if
630       call abi_gpu_xgemm(cplex, 'T', 'N', &
631         &            nprojs, ndat*nspinor, npwin, &
632         &            cone, &
633         &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwin, &
634         &            temp_realvec_gpu, npwin, &
635         &            cone, &
636         &            gemm_nonlop_gpu_data%projections_gpu, nprojs)
637 
638       !gemm_nonlop_gpu_data%projections_gpu = 2 * gemm_nonlop_gpu_data%projections_gpu
639       call gpu_xscal(cplex, nprojs*nspinor*ndat, ctwo, gemm_nonlop_gpu_data%projections_gpu, 1)
640 
641     end if ! cplex == 2
642 
643 !    call xmpi_sum(projections,mpi_enreg%comm_fft,ierr)
644 
645     if(cpopt >= 0) then
646       ! copy from GPU projections_gpu to HOST projections_cpu
647       call copy_from_gpu(projections_cpu, gemm_nonlop_gpu_data%projections_gpu,&
648           INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp)
649 
650       if(.not. local_vectproj) then
651         !This use-case is extremely painful for GEMM GPU nonlop performance.
652         !The vectproj parameter should always be provided to avoid it
653 
654         ! store in cprjin
655         do idat=1, ndat*nspinor
656           shift = 0
657           do iatom = 1, natom
658             nlmn = cprjin(iatom, idat)%nlmn
659             cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections_cpu(1:cplex, shift+1:shift+nlmn, idat)
660             shift = shift + nlmn
661           end do
662         end do
663       end if
664 
665     end if ! cpopt >= 0
666 
667   end if ! cpopt >= 2
668 
669   if(choice > 0) then
670 
671     if(choice /= 7) then
672       ! opernlc
673       iatm = 0
674       ndgxdt = 0
675       ndgxdtfac = 0
676       nd2gxdt = 0
677       nd2gxdtfac = 0
678       optder = 0
679 
680       ! get the maximum of the nattyp array
681       nattyp_max = maxval(nattyp)
682 
683       !! gpu alloc and init sij_typ_size_bytes
684       sij_typ_size_bytes = (((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2) * dp
685       ABI_MALLOC_CUDA( sij_typ_gpu, sij_typ_size_bytes )
686 
687       ABI_MALLOC     ( sij_typ    , (((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2) )
688 
689       shift = 0
690       do itypat=1, ntypat
691         nlmn=count(indlmn(3,:,itypat)>0)
692         if (paw_opt>=2) then
693 
694           if (cplex_enl==1) then
695 
696             do ilmn=1,nlmn*(nlmn+1)/2
697               sij_typ(ilmn) = sij(ilmn,itypat)
698             end do
699 
700           else
701 
702             do ilmn=1,nlmn*(nlmn+1)/2
703               sij_typ(ilmn) = sij(2*ilmn-1,itypat)
704             end do
705 
706           end if
707 
708           call copy_on_gpu(sij_typ, sij_typ_gpu, sij_typ_size_bytes )
709 
710         end if ! paw_opt>=2
711 
712         ibeg = shift+1
713         iend = shift+nattyp(itypat)*nlmn
714 
715        !Treatment of the parallelization over spinors
716         shift_spinor = 0
717         if (mpi_enreg%paral_spinor==1) then
718           shift_spinor = mpi_enreg%me_spinor
719         end if
720 
721         ! Use the Kokkos implementation of opernlc if available
722         if (gpu_option == ABI_GPU_KOKKOS) then
723 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS)
724           call opernlc_ylm_allwf_kokkos(cplex, cplex_enl, cplex_fac, &
725             &                           dimenl1, dimenl2, dimekbq, &
726             &                           iatm, itypat, ntypat, nprojs, &
727             &                           natom, nattyp(itypat), nspinor, &
728             &                           nspinortot, paw_opt, &
729             &                           nlmn, lmnmax, &
730             &                           enl_gpu, &
731             &                           gemm_nonlop_gpu_data%projections_gpu, &
732             &                           gemm_nonlop_gpu_data%vnl_projections_gpu, &
733             &                           vnl_projections2_gpu, &
734             &                           gemm_nonlop_gpu_data%s_projections_gpu, &
735             &                           shift_spinor, ndat, &
736             &                           atindx1_gpu, &
737             &                           indlmn_gpu, &
738             &                           lambda_gpu, &
739             &                           sij_typ_gpu, &
740             &                           shift, nattyp_max)
741 #endif
742         ! Fall back on CPU implementation of opernlc
743         else
744 
745           call copy_from_gpu(  projections_cpu, gemm_nonlop_gpu_data%projections_gpu, &
746             &              INT(cplex,     c_size_t) * nprojs * nspinor*ndat * dp)
747 
748           call opernlc_ylm_allwf_cpu(atindx1, cplex, cplex_enl, cplex_fac, &
749             &                  dimenl1, dimenl2, dimekbq, enl, &
750             &                  projections_cpu(:, ibeg:iend, 1:nspinor*ndat), &
751             &                  vnl_projections_cpu(:, ibeg:iend, 1:nspinor*ndat), &
752             &                  s_projections_cpu(:, ibeg:iend, 1:nspinor*ndat), &
753             &                  iatm, indlmn(:,:,itypat), itypat, ndat, lambda, mpi_enreg, natom, &
754             &                  nattyp(itypat), nlmn, nspinor, nspinortot, paw_opt, sij_typ)
755 
756           call copy_on_gpu(  s_projections_cpu, gemm_nonlop_gpu_data%  s_projections_gpu, &
757             &              INT(cplex,     c_size_t) * nprojs * nspinor*ndat * dp)
758           call copy_on_gpu(vnl_projections_cpu, gemm_nonlop_gpu_data%vnl_projections_gpu, &
759             &              INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp)
760         end if
761 
762         shift = shift + nattyp(itypat)*nlmn
763         iatm  = iatm  + nattyp(itypat)
764       end do ! itypat
765 
766       ABI_FREE(sij_typ)
767       ABI_FREE_CUDA( sij_typ_gpu )
768 
769     else ! choice == 7
770 
771       ! TO BE REMOVED - DEBUG ONLY
772       call copy_gpu_to_gpu(gemm_nonlop_gpu_data%s_projections_gpu, &
773            &               gemm_nonlop_gpu_data%projections_gpu, &
774            &               INT(cplex, c_size_t) * nprojs * nspinor * ndat * dp)
775 
776     end if ! choice
777 
778     ! opernlb
779     if (paw_opt == 3 .or. paw_opt == 4) then
780 
781       ! Get svectout from s_projections
782       if(cplex == 2) then
783 
784         call abi_gpu_xgemm(cplex, 'N', 'N', &
785           &            npwout, ndat*nspinor, nprojs, &                                                ! M,N,K
786           &            cone, &                                                                        ! alpha
787           &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwout, & ! A, LDA
788           &            gemm_nonlop_gpu_data%s_projections_gpu, nprojs, &                              ! B, LDB
789           &            czero, &                                                                       ! beta
790           &            svectout_ptr, npwout)                                                          ! C, LDC
791 
792       else
793 
794         if (.not. gpu_allocated(temp_realvec_gpu)) then
795           ABI_ERROR("Please provide memory allocation for temp_realvec_gpu array")
796         end if
797 
798         call abi_gpu_xgemm(cplex, 'N', 'N', &
799           &            npwout, ndat*nspinor, nprojs, &                                                ! M,N,K
800           &            cone, &                                                                        ! alpha
801           &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwout, & ! A, LDA
802           &            gemm_nonlop_gpu_data%s_projections_gpu, nprojs, &                              ! B, LDB
803           &            czero, &                                                                       ! beta
804           &            temp_realvec_gpu, npwout)                                                      ! C, LDC
805         !svectout(1,1:npwout*nspinor*ndat) = temp_realvec_gpu(1:npwout*nspinor*ndat)
806         call insert_real_part(svectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat)
807 
808         call abi_gpu_xgemm(cplex, 'N', 'N', &
809           &            npwout, ndat*nspinor, nprojs, &                                                ! M,N,K
810           &            cone, &                                                                        ! alpha
811           &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwout,& ! A, LDA
812           &            gemm_nonlop_gpu_data%s_projections_gpu, nprojs, &                              ! B, LDB
813           &            czero, &                                                                       ! beta
814           &            temp_realvec_gpu, npwout)                                                      ! C, LDC
815         !svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
816         call insert_imag_part(svectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat)
817 
818       end if ! cplex == 2
819 
820       if (choice /= 7) then
821 
822         ! compute : svectout_gpu = svectout_gpu + vectin_gpu
823         ! this is an axpy operation with x => vectin_gpu, y => svectout_gpu and alpha=1
824         ! remember that svectout_gpu and vectin_gpu have the same size when paw_opt >= 3 and paw_opt < 6,
825         ! which is the case here
826         call abi_gpu_xaxpy(1, &                              ! real
827           &            2*npwin*nspinor*ndat, &               ! size
828           &            cone, &                               ! alpha
829           &            vectin_ptr, 1, &                      ! X, incrx
830           &            svectout_ptr, 1)                      ! Y, incry
831 
832       endif
833 
834       if(gpu_option==ABI_GPU_LEGACY) then
835         ! copy back results on host
836         call copy_from_gpu(svectout, svectout_ptr, INT(2, c_size_t)*npwout*nspinor*(paw_opt/3)*ndat * dp)
837       end if
838 
839       ! reminder : a call to cudaDeviceSynchronize is needed before svectout can safely be re-used on the host;
840       ! it is not issued here, but only when really needed
841 
842     end if ! (paw_opt == 3 .or. paw_opt == 4)
843 
844     if (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then
845 
846       ! Get vectout from vnl_projections
847       if (cplex_fac == 2) then
848 
849         call abi_gpu_xgemm(cplex, 'N', 'N', &
850           &            npwout, ndat*nspinor, nprojs, &                                                ! M,N,K
851           &            cone, &                                                                        ! alpha
852           &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwout, & ! A, LDA
853           &            gemm_nonlop_gpu_data%vnl_projections_gpu, nprojs, &                            ! B, LDB
854           &            czero, &                                                                       ! beta
855           &            vectout_ptr, npwout)                                                           ! C, LDC
856 
857       else
858 
859         if (.not. gpu_allocated(temp_realvec_gpu)) then
860           ABI_ERROR("Please provide memory allocation for temp_realvec_gpu array")
861         end if
862 
863         call abi_gpu_xgemm(cplex, 'N', 'N', &
864           &            npwout, ndat*nspinor, nprojs, &                                               ! M,N,K
865           &            cone, &                                                                       ! alpha
866           &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwout, &  ! A, LDA
867           &            gemm_nonlop_gpu_data%vnl_projections_gpu, nprojs, &                           ! B, LDB
868           &            czero, &                                                                      ! beta
869           &            temp_realvec_gpu, npwout)                                                     ! C, LDC
870         ! vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
871         call insert_real_part(vectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat)
872 
873         call abi_gpu_xgemm(cplex, 'N', 'N', &
874           &            npwout, ndat*nspinor, nprojs, &                                               ! M,N,K
875           &            cone, &                                                                       ! alpha
876           &            gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwout, & ! A, LDA
877           &            gemm_nonlop_gpu_data%vnl_projections_gpu, nprojs, &                           ! B, LDB
878           &            czero, &                                                                      ! beta
879           &            temp_realvec_gpu, npwout)                                                     ! C, LDC
880         ! vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
881         call insert_imag_part(vectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat)
882 
883       end if  ! cplex_fac == 2
884 
885       ! copy back results on host
886       if(gpu_option==ABI_GPU_LEGACY) then
887         call copy_from_gpu(vectout, vectout_ptr, INT(2, c_size_t)*npwout*nspinor*ndat * dp)
888       end if
889 
890     end if ! (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)
891 
892   end if ! choice > 0
893 
894   if (dimekbq>1) then
895     ABI_FREE_CUDA(vnl_projections2_gpu)
896   end if
897 
898   ! device sync before reusing data computed on device
899   ! this may not be the best location for this sync
900   call gpu_device_synchronize()
901 
902   if (gpu_allocated(temp_realvec_gpu)) then
903     ABI_FREE_CUDA(temp_realvec_gpu)
904   end if
905 
906   ABI_FREE_CUDA( enl_gpu )
907   ABI_FREE_CUDA( atindx1_gpu )
908   ABI_FREE_CUDA( indlmn_gpu )
909   ABI_FREE_CUDA( lambda_gpu )
910 
911   ! if projections_cpu was locally allocated, free it here
912   if(.not. local_vectproj) then
913     ABI_FREE(projections_cpu)
914   end if
915   ABI_FREE(  s_projections_cpu) ! TO BE REMOVED
916   ABI_FREE(vnl_projections_cpu) ! TO BE REMOVED
917 
918  end subroutine gemm_nonlop_gpu
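
 For istwf_k > 1 (cplex == 1) the subroutine above computes only the real part of the
 projections, replacing one complex GEMM by two real GEMMs over the split storage
 projs_r/projs_i, followed by a factor-2 rescaling. A self-contained sketch of that identity
 with plain BLAS (sizes illustrative; the istwf_k==2 G=0 corrections applied by fix_realvec
 are omitted):

    program real_gemm_sketch
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: npw = 500, nprojs = 20, ndat = 4
      real(dp) :: projs_r(npw,nprojs), projs_i(npw,nprojs)
      real(dp) :: psi_r(npw,ndat), psi_i(npw,ndat), cproj(nprojs,ndat)
      call random_number(projs_r); call random_number(projs_i)
      call random_number(psi_r);   call random_number(psi_i)
      ! Re <p|psi> = P_r^T psi_r + P_i^T psi_i, accumulated over two DGEMMs (beta=0, then beta=1)
      call dgemm('T', 'N', nprojs, ndat, npw, 1.0_dp, projs_r, npw, &
                 psi_r, npw, 0.0_dp, cproj, nprojs)
      call dgemm('T', 'N', nprojs, ndat, npw, 1.0_dp, projs_i, npw, &
                 psi_i, npw, 1.0_dp, cproj, nprojs)
      cproj = 2.0_dp*cproj   ! time-reversal doubling, as done by gpu_xscal with ctwo
    end program real_gemm_sketch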

m_gemm_nonlop_gpu/init_gemm_nonlop_gpu [ Functions ]

NAME

 init_gemm_nonlop_gpu

FUNCTION

 Memory allocation of the gemm_nonlop_kpt_gpu array

INPUTS

 nkpt= number of k-points

PARENTS

      m_gstate

SOURCE

190  subroutine init_gemm_nonlop_gpu(nkpt)
191 
192    integer,intent(in) :: nkpt
193    integer :: ikpt
194 
195    ! *************************************************************************
196 
197    ! TODO only allocate the number of kpt treated by this proc
198    ABI_MALLOC(gemm_nonlop_kpt_gpu, (nkpt))
199    do ikpt=1,nkpt
200      gemm_nonlop_kpt_gpu(ikpt)%npw = -1
201      gemm_nonlop_kpt_gpu(ikpt)%nprojs = -1
202    end do
203 
204    gemm_nonlop_gpu_data % allocated = .false.
205 
206  end subroutine init_gemm_nonlop_gpu
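
 init_gemm_nonlop_gpu marks every slot with the sentinel value nprojs = -1 so that
 destroy_gemm_nonlop_gpu can later tell which k-points actually received device buffers.
 A standalone illustration of this sentinel pattern (plain Fortran, no GPU calls; the type
 and names are invented for the example):

    program sentinel_sketch
      implicit none
      type :: kpt_slot
        integer :: nprojs = -1          ! -1 means "no buffer attached"
        real, allocatable :: buf(:)     ! stand-in for a device allocation
      end type kpt_slot
      type(kpt_slot), allocatable :: slots(:)
      integer :: ik
      allocate(slots(4))                                   ! "init": all slots carry the sentinel
      allocate(slots(2)%buf(100)); slots(2)%nprojs = 100   ! "make" touches only k-point 2
      do ik = 1, size(slots)                               ! "destroy": free only what was made
        if (slots(ik)%nprojs /= -1) then
          deallocate(slots(ik)%buf)
          slots(ik)%nprojs = -1
        end if
      end do
      deallocate(slots)
    end program sentinel_sketch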

m_gemm_nonlop_gpu/make_gemm_nonlop_gpu [ Functions ]

NAME

 make_gemm_nonlop_gpu

FUNCTION

 Build the projector matrices for a given k-point (via make_gemm_nonlop) and upload them to the GPU.

INPUTS

 ikpt= index of the k-point to be treated
 npw= number of plane waves at this k-point
 lmnmax= maximum number of (l,m,n) components over all atom types
 ntypat= number of atom types
 indlmn(6,lmnmax,ntypat)= array giving l,m,n,lm,ln,spin for each projector
 nattyp(ntypat)= number of atoms of each type
 istwf_k= storage mode of the wavefunctions at this k-point
 ucvol= unit cell volume
 ffnl_k= nonlocal form factors at this k-point
 ph3d_k= 3-dim structure factors (phases) at this k-point
 kpt_k= coordinates of the k-point
 kg_k= reduced planewave coordinates at this k-point
 kpg_k= (k+G) vectors at this k-point
 [compute_grad_strain,compute_grad_atom]= optional flags to request gradients

PARENTS

      m_nonlop

CHILDREN

      make_gemm_nonlop,alloc_on_gpu,copy_on_gpu

SOURCE

280  subroutine make_gemm_nonlop_gpu(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, &
281 &                     ph3d_k,kpt_k,kg_k,kpg_k, &
282 &                     compute_grad_strain,compute_grad_atom) ! Optional parameters
283 
284    !Arguments ------------------------------------
285    integer, intent(in) :: ikpt
286    integer, intent(in) :: npw, lmnmax,ntypat
287    integer, intent(in) :: indlmn(:,:,:), kg_k(:,:)
288    integer, intent(in) :: nattyp(ntypat)
289    integer, intent(in) :: istwf_k
290    logical, intent(in), optional :: compute_grad_strain,compute_grad_atom
291    real(dp), intent(in) :: ucvol
292    real(dp), intent(in) :: ffnl_k(:,:,:,:)
293    real(dp), intent(in) :: ph3d_k(:,:,:)
294    real(dp), intent(in) :: kpt_k(:)
295    real(dp), intent(in), target :: kpg_k(:,:)
296 
297    ! locals
298    integer              :: nprojs, itypat
299 
300    ! *************************************************************************
301 
302    ABI_UNUSED((/ikpt,npw,lmnmax,ntypat,indlmn,kg_k,nattyp,istwf_k/))
303    ABI_UNUSED((/ucvol,ffnl_k,ph3d_k,kpt_k,kpg_k/))
304    ABI_UNUSED((/compute_grad_strain,compute_grad_atom/))
305 
306    call make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, &
307           ph3d_k,kpt_k,kg_k,kpg_k, &
308           compute_grad_strain=compute_grad_strain,compute_grad_atom=compute_grad_atom)
309 
310    nprojs = 0
311    do itypat=1,ntypat
312      nprojs = nprojs + count(indlmn(3,:,itypat)>0)*nattyp(itypat)
313    end do
314    gemm_nonlop_kpt_gpu(ikpt)%npw    = npw
315    gemm_nonlop_kpt_gpu(ikpt)%nprojs = nprojs
316 
317 #ifdef DEBUG_VERBOSE_GPU
318    if(xmpi_comm_rank(xmpi_world) == 0) then
319      call check_gpu_mem("make_gemm_nonlop begin")
320      call wrtout(std_out,sjoin(" npw    .......", itoa(npw)),    'COLL')
321      call wrtout(std_out,sjoin(" nprojs .......", itoa(nprojs)), 'COLL')
322    end if
323 #endif
324 
325    if(istwf_k <= 1) then
326      call alloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs, INT(2,c_size_t)*npw*nprojs*dp)
327      ! TODO : gradients
328    else
329      call alloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_r, INT(1, c_size_t)*npw*nprojs*dp)
330      call alloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_i, INT(1, c_size_t)*npw*nprojs*dp)
331      ! TODO : gradients
332    end if
333 
334 #ifdef DEBUG_VERBOSE_GPU
335    if(xmpi_comm_rank(xmpi_world) == 0) then
336      call check_gpu_mem("make_gemm_nonlop end  ")
337    end if
338 #endif
339 
340    ! upload data to gpu memory
341    if(istwf_k <= 1) then
342      call copy_on_gpu(gemm_nonlop_kpt(ikpt)%projs, gemm_nonlop_kpt_gpu(ikpt)%projs, INT(2, c_size_t)*npw*nprojs*dp)
343      ! TODO : gradients
344    else
345      call copy_on_gpu(gemm_nonlop_kpt(ikpt)%projs_r, gemm_nonlop_kpt_gpu(ikpt)%projs_r, &
346        &                    INT(1, c_size_t)*npw*nprojs*dp)
347      call copy_on_gpu(gemm_nonlop_kpt(ikpt)%projs_i, gemm_nonlop_kpt_gpu(ikpt)%projs_i, &
348        &                    INT(1, c_size_t)*npw*nprojs*dp)
349    end if
350 
351  end subroutine make_gemm_nonlop_gpu
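
 The nprojs bookkeeping and the device buffer sizes used above can be checked in isolation:
 nprojs sums the number of active (l,m,n) channels per atom type times the number of atoms of
 that type, and the byte counts mirror the istwf_k branch (one complex buffer, or two real
 ones). A standalone sketch with illustrative values (dp doubles as the byte size of a
 real(dp), exactly as in the module):

    program projs_size_sketch
      use, intrinsic :: iso_c_binding, only : c_size_t
      implicit none
      integer, parameter :: dp = kind(1.0d0)     ! = 8 = bytes per real(dp)
      integer, parameter :: lmnmax = 8, ntypat = 2
      integer :: indlmn(6,lmnmax,ntypat), nattyp(ntypat)
      integer :: nprojs, itypat, npw, istwf_k
      integer(c_size_t) :: bytes
      npw = 1000; istwf_k = 1
      indlmn = 0
      indlmn(3,1:4,1) = 1; indlmn(3,1:6,2) = 1   ! 4 and 6 active channels per type
      nattyp = (/ 2, 3 /)                        ! atoms per type
      nprojs = 0
      do itypat = 1, ntypat
        nprojs = nprojs + count(indlmn(3,:,itypat) > 0)*nattyp(itypat)
      end do                                     ! here: 4*2 + 6*3 = 26
      if (istwf_k <= 1) then
        bytes = INT(2, c_size_t)*npw*nprojs*dp   ! one complex buffer (projs)
      else
        bytes = INT(1, c_size_t)*npw*nprojs*dp   ! each of projs_r and projs_i
      end if
      print *, 'nprojs =', nprojs, '  bytes per buffer =', bytes
    end program projs_size_sketch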