TABLE OF CONTENTS


ABINIT/m_abi_gpu_linalg [ Modules ]

NAME

  m_abi_gpu_linalg

FUNCTION

  Interfaces of the GPU linear-algebra wrapper subroutines

COPYRIGHT

  Copyright (C) 2011-2024 ABINIT group (FDahm)
  This file is distributed under the terms of the
  GNU General Public License, see ~ABINIT/Infos/copyright
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE


m_abi_gpu_linalg/abi_gpu_work_resizeC [ Functions ]

NAME

 abi_gpu_work_resizeC

m_abi_gpu_linalg/abi_gpu_work_resizeCptr [ Functions ]

NAME

 abi_gpu_work_resizeCptr

m_abi_gpu_linalg/abi_gpu_work_resizeI [ Functions ]

NAME

 abi_gpu_work_resizeI

m_abi_gpu_linalg/abi_gpu_work_resizeR [ Functions ]

NAME

 abi_gpu_work_resizeR

m_abi_gpu_linalg/abi_gpu_xaxpy [ Functions ]

NAME

  abi_gpu_xaxpy

FUNCTION

  Compute a BLAS-1 AXPY operation on GPU
  y = alpha * x + y

INPUTS

  cplx  = 1 if real, 2 if complex
  size  = vector size
  alpha = scalar complex value
  x     = pointer to gpu memory location of array x
  incrx = stride between consecutive elements of x
  y     = pointer to gpu memory location of array y
  incry = stride between consecutive elements of y

SOURCE

subroutine abi_gpu_xaxpy_cptr(cplx, size, alpha, x, incrx, y, incry)

  !Arguments ------------------------------------
  integer,      intent(in)    :: cplx
  integer,      intent(in)    :: size
  complex(dpc), intent(in)    :: alpha
  type(c_ptr),  intent(in)    :: x
  integer,      intent(in)    :: incrx
  type(c_ptr),  intent(in)    :: y
  integer,      intent(in)    :: incry

! *************************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xaxpy(cplx, size, alpha, x, incrx, y, incry)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,incrx,incry,size/))
  ABI_UNUSED((/alpha/))
  ABI_UNUSED((/x,y/))
#endif

end subroutine abi_gpu_xaxpy_cptr
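
As an illustration, here is a minimal usage sketch (not part of the module), assuming GPU support is enabled, that abi_gpu_xaxpy is the generic interface resolving to the _cptr variant above, and using the memory helpers documented later in this file (alloc_on_gpu, copy_on_gpu, copy_from_gpu, dealloc_on_gpu):

   ! Hypothetical sketch: y = 2*x + y on n real(dp) elements held on the GPU
   integer, parameter :: n = 1000
   real(dp)           :: x_cpu(n), y_cpu(n)
   type(c_ptr)        :: x_gpu, y_gpu
   integer(c_size_t)  :: nbytes

   nbytes = int(n, c_size_t) * dp           ! dp (=8) is also used as the byte size of a double, as elsewhere in this module
   call alloc_on_gpu(x_gpu, nbytes)
   call alloc_on_gpu(y_gpu, nbytes)
   call copy_on_gpu(x_cpu, x_gpu, nbytes)   ! host -> device
   call copy_on_gpu(y_cpu, y_gpu, nbytes)
   call abi_gpu_xaxpy(1, n, cmplx(2.0_dp, 0.0_dp, kind=dpc), x_gpu, 1, y_gpu, 1)
   call copy_from_gpu(y_cpu, y_gpu, nbytes) ! device -> host
   call dealloc_on_gpu(x_gpu)
   call dealloc_on_gpu(y_gpu)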

m_abi_gpu_linalg/abi_gpu_xcopy [ Functions ]

NAME

  abi_gpu_xcopy

FUNCTION

  Compute a BLAS-1 COPY operation on GPU
  y = x (copy x into y)

INPUTS

  cplx   = 1 if real, 2 if complex
  size   = input vector size
  x      = pointer to gpu memory location of array x
  incrx  = stride between consecutive elements of x
  y      = pointer to gpu memory location of array y
  incry  = stride between consecutive elements of y

SOURCE

subroutine abi_gpu_xcopy_cptr(cplx, size, x, incrx, y, incry)

  !Arguments ------------------------------------
  integer,      intent(in)    :: cplx
  integer,      intent(in)    :: size
  type(c_ptr),  intent(in)    :: x
  integer,      intent(in)    :: incrx
  type(c_ptr),  intent(in)    :: y
  integer,      intent(in)    :: incry

! *************************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xcopy(cplx, size, x, incrx, y, incry)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,incrx,incry,size/))
  ABI_UNUSED((/x,y/))
#endif

end subroutine abi_gpu_xcopy_cptr

m_abi_gpu_linalg/abi_gpu_xgemm [ Functions ]

NAME

  abi_gpu_xgemm

FUNCTION

  Compute a scalar-matrix-matrix product and add the result to a scalar-matrix product on GPU
  c = alpha * op(a) * op(b) + beta * c

INPUTS

  cplx  = 1 if real, 2 if complex
  transa= form of op(a) to be used in the matrix multiplication
  transb= form of op(b) to be used in the matrix multiplication
  m     = number of rows of the matrix op(a) and of the matrix c
  n     = number of columns of the matrix op(b) and of the matrix c
  k     = number of columns of the matrix op(a) and the number of rows of the matrix op(b)
  alpha = alpha scalar coefficient for matrix op(a)
  a     = pointer to gpu memory location of matrix a
  lda   = first dimension of a
  b     = pointer to gpu memory location of matrix b
  ldb   = first dimension of b
  beta  = beta scalar coefficient for matrix c
  c     = pointer to gpu memory location of matrix c
  ldc   = first dimension of c

OUTPUT

  c     = c matrix

SOURCE

subroutine abi_gpu_xgemm_cptr(cplx,transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)

!Arguments ------------------------------------
 integer,intent(in) :: cplx,lda,ldb,ldc,m,n,k
 complex(dpc),intent(in) :: alpha,beta
 character(len=1),intent(in) :: transa,transb
 type(c_ptr),intent(in) :: a,b
 type(c_ptr),intent(in) :: c

! *********************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xgemm(cplx,transa,transb,m,n,k,alpha,&
      a,lda,b,ldb,beta,c,ldc)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,lda,ldb,ldc,m,n,k/))
  ABI_UNUSED((/alpha,beta/))
  ABI_UNUSED((/transa,transb/))
  ABI_UNUSED((/a,b,c/))
#endif

end subroutine abi_gpu_xgemm_cptr
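
For instance, a plain real matrix product c = a*b (no transposition) can be expressed as follows. This is a sketch only, assuming abi_gpu_xgemm is the generic interface over the _cptr variant above, that a (m x k), b (k x n) and c (m x n) already reside on the GPU, and that cone/czero are the usual (1,0)/(0,0) complex(dpc) constants:

   ! Hypothetical sketch: c = 1*a*b + 0*c, column-major storage,
   ! leading dimensions equal to the row counts
   call abi_gpu_xgemm(1, 'n', 'n', m, n, k, cone, a_gpu, m, b_gpu, k, czero, c_gpu, m)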

m_abi_gpu_linalg/abi_gpu_xgemm_strided [ Functions ]

NAME

  abi_gpu_xgemm_strided

FUNCTION

  Compute a batched scalar-matrix-matrix product on GPU (one GEMM per batch).
  Meant for non-contiguous matrices whose data is uniformly split into the same number of batches in each matrix.
  c = alpha * op(a) * op(b) + beta * c

INPUTS

  cplx       = 1 if real, 2 if complex
  transa     = form of op(a) to be used in the matrix multiplication
  transb     = form of op(b) to be used in the matrix multiplication
  m          = number of rows of the matrix op(a) and of the matrix c
  n          = number of columns of the matrix op(b) and of the matrix c
  k          = number of columns of the matrix op(a) and the number of rows of the matrix op(b)
  alpha      = alpha scalar coefficient for matrix op(a)
  a          = pointer to gpu memory location of matrix a
  lda        = first dimension of a
  strideA    = stride between each batch in matrix a
  b          = pointer to gpu memory location of matrix b
  ldb        = first dimension of b
  strideB    = stride between each batch in matrix b
  beta       = beta scalar coefficient for matrix c
  c          = pointer to gpu memory location of matrix c
  ldc        = first dimension of c
  strideC    = stride between each batch in matrix c
  batchCount = number of batches in each matrix

OUTPUT

  c     = c matrix

SOURCE

subroutine abi_gpu_xgemm_strided_cptr(cplx,transa,transb,m,n,k,alpha,a,lda,strideA,b,ldb,strideB,beta,c,ldc,strideC,batchCount)

!Arguments ------------------------------------
 integer,intent(in) :: cplx,lda,ldb,ldc,m,n,k
 integer,intent(in) :: strideA,strideB,strideC,batchCount
 complex(dpc),intent(in) :: alpha,beta
 character(len=1),intent(in) :: transa,transb
 type(c_ptr),intent(in) :: a,b
 type(c_ptr),intent(in) :: c

! *********************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xgemm_strided_batched(cplx,transa,transb,m,n,k,alpha,&
      a,lda,strideA,b,ldb,strideB,beta,c,ldc,strideC,batchCount)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,lda,ldb,ldc,m,n,k/))
  ABI_UNUSED((/strideA,strideB,strideC,batchCount/))
  ABI_UNUSED((/alpha,beta/))
  ABI_UNUSED((/transa,transb/))
  ABI_UNUSED((/a,b,c/))
#endif

end subroutine abi_gpu_xgemm_strided_cptr
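
For contiguous, column-major batches the strides are simply the per-matrix element counts. A sketch, under the same assumptions as the xgemm example above (generic interface name, operands already on the GPU):

   ! Hypothetical sketch: nbatch independent products c_i = a_i * b_i, with the
   ! i-th matrices stored back to back, so the strides (in elements) are
   ! strideA = lda*k, strideB = ldb*n and strideC = ldc*n
   call abi_gpu_xgemm_strided(1, 'n', 'n', m, n, k, cone, &
 &      a_gpu, m, m*k, &
 &      b_gpu, k, k*n, &
 &      czero, &
 &      c_gpu, m, m*n, &
 &      nbatch)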

m_abi_gpu_linalg/abi_gpu_xheevd [ Functions ]

NAME

  abi_gpu_xheevd

FUNCTION

  Compute a LAPACK SYEVD/HEEVD operation on GPU:
  compute eigenvalues and, optionally, eigenvectors of a
  real symmetric (or complex Hermitian) matrix

 See cusolver documentation
 https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-heevd

 See also LAPACK doc in reference implementation:
 https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dheevd.f

INPUTS

  cplx  = 1 if real, 2 if complex
  jobz  = character, 'n' (eigenvalues only) or 'v' (eigenvalues + eigenvectors)
  uplo  = character, 'u' or 'l'
  A_nrows = matrix size
  A = pointer to gpu memory location of matrix A
  lda   = leading dimension of matrix A
  W = pointer to gpu memory location of array W (output eigenvalues)
  devInfo = error/status code returned by the solver (0 on success)

SOURCE

subroutine abi_gpu_xheevd_cptr(cplx, jobz, uplo, A_nrows, &
                   A, lda, &
                   W,      &
                   devInfo)

  ! Arguments ------------------------------------
  integer,         intent(in   ) :: cplx
  character(len=1),intent(in   ) :: jobz
  character(len=1),intent(in   ) :: uplo
  integer,         intent(in   ) :: A_nrows
  type(c_ptr),     intent(in   ) :: A
  integer,         intent(in   ) :: lda
  type(c_ptr),     intent(in   ) :: W
  integer,         intent(inout) :: devInfo

  ! Local variables ------------------------------
  integer     :: bufferSize
  type(c_ptr) :: gpu_ptr

! *************************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  ! probe the needed bufferSize
  call gpu_xsyevd_buffersize(cplx, jobz, uplo, &
                 A_nrows, &
                 A, lda, &
                 W, &
                 bufferSize)

  select case(cplx)

  case (1)
    ! resize work array if needed and retrieve the work pointer to use
    if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
          .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
      call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
      gpu_ptr = c_loc(r_work_managed)
    else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
      call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
      gpu_ptr = xomp_get_mapped_ptr(c_loc(r_work))
#else
      call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(1,c_size_t)*bufferSize*dp)
      gpu_ptr = gpu_work
#endif
    end if

  case (2)
    ! resize work array if needed and retrieve the work pointer to use
    if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
          .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
      call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
      gpu_ptr = c_loc(c_work_managed)
    else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
      call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
      gpu_ptr = xomp_get_mapped_ptr(c_loc(c_work))
#else
      call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(2,c_size_t)*bufferSize*dp)
      gpu_ptr = gpu_work
#endif
    end if

  end select

  ! and compute (finally)
  call gpu_xsyevd(cplx, jobz, uplo, &
                 A_nrows, &
                 A, lda, &
                 W, &
                 gpu_ptr, bufferSize, devInfo)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,A_nrows,lda,devInfo,bufferSize/))
  ABI_UNUSED((/jobz,uplo/))
  ABI_UNUSED((/A,W,gpu_ptr/))
#endif

end subroutine abi_gpu_xheevd_cptr

m_abi_gpu_linalg/abi_gpu_xhegvd [ Functions ]

NAME

  abi_gpu_xhegvd

FUNCTION

  Compute a LAPACK SYGVD/HEGVD operation on GPU:
  compute eigenvalues/eigenvectors of a real generalized
  symmetric-definite eigenproblem

 See cusolver documentation
 https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-hegvd

 See also LAPACK doc in reference implementation:
 https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dhegvd.f

INPUTS

  cplx  = 1 if real, 2 if complex
  itype = integer, type of problem
  jobz  = character, 'n' (eigenvalues only) or 'v' (eigenvalues + eigenvectors)
  uplo  = character, 'u' or 'l'
  A_nrows = matrix size
  A = pointer to gpu memory location of matrix A
  lda   = leading dimension of matrix A
  B = pointer to gpu memory location of matrix B
  ldb   = leading dimension of matrix B
  W = pointer to gpu memory location of array W (output eigenvalues)
  devInfo = error/status code returned by the solver (0 on success)

SOURCE

subroutine abi_gpu_xhegvd_cptr(cplx, itype, jobz, uplo, A_nrows, &
                   A, lda, &
                   B, ldb, &
                   W,      &
                   devInfo)

  ! Arguments ------------------------------------
  integer,         intent(in   ) :: cplx
  integer,         intent(in   ) :: itype
  character(len=1),intent(in   ) :: jobz
  character(len=1),intent(in   ) :: uplo
  integer,         intent(in   ) :: A_nrows
  type(c_ptr),     intent(in   ) :: A
  integer,         intent(in   ) :: lda
  type(c_ptr),     intent(in   ) :: B
  integer,         intent(in   ) :: ldb
  type(c_ptr),     intent(in   ) :: W
  integer,         intent(inout) :: devInfo

  ! Local variables ------------------------------
  integer     :: bufferSize
  type(c_ptr) :: gpu_ptr

! *************************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  ! probe the needed bufferSize
  call gpu_xsygvd_buffersize(cplx, itype, jobz, uplo, &
                 A_nrows, &
                 A, lda, &
                 B, ldb, &
                 W, &
                 bufferSize)

  select case(cplx)

  case (1)
    ! resize work array if needed and retrieve the work pointer to use
    if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
          .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
      call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
      gpu_ptr = c_loc(r_work_managed)
    else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
      call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
      gpu_ptr = xomp_get_mapped_ptr(c_loc(r_work))
#else
      call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(1,c_size_t)*bufferSize*dp)
      gpu_ptr = gpu_work
#endif
    end if

  case (2)
    ! resize work array if needed and retrieve the work pointer to use
    if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
          .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
      call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
      gpu_ptr = c_loc(c_work_managed)
    else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
      call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
      gpu_ptr = xomp_get_mapped_ptr(c_loc(c_work))
#else
      call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(2,c_size_t)*bufferSize*dp)
      gpu_ptr = gpu_work
#endif
    end if

  end select

  ! and compute (finally)
  call gpu_xsygvd(cplx, itype, jobz, uplo, &
                 A_nrows, &
                 A, lda, &
                 B, ldb, &
                 W, &
                 gpu_ptr, bufferSize, devInfo)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,itype,A_nrows,lda,ldb,devInfo,bufferSize/))
  ABI_UNUSED((/jobz,uplo/))
  ABI_UNUSED((/A,B,W,gpu_ptr/))
#endif

end subroutine abi_gpu_xhegvd_cptr
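
A sketch of a typical call, assuming abi_gpu_xhegvd is the generic interface over the _cptr variant above and that the n x n matrices A and B and the eigenvalue array W are already resident on the GPU:

   ! Hypothetical sketch: solve the real generalized problem A*v = lambda*B*v
   ! (itype = 1), eigenvalues + eigenvectors, upper triangles referenced
   integer :: info
   call abi_gpu_xhegvd(1, 1, 'v', 'u', n, A_gpu, n, B_gpu, n, W_gpu, info)
   if (info /= 0) then
     ! nonzero info signals a solver failure, as in the LAPACK convention
   end if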

m_abi_gpu_linalg/abi_gpu_xpotrf [ Functions ]

NAME

  abi_gpu_xpotrf

FUNCTION

  Compute a LAPACK POTRF operation on GPU:
  Cholesky factorization of a symmetric (Hermitian)
  positive-definite matrix

 See cusolver documentation
 https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-potrf

 See also LAPACK doc in reference implementation:
 https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dpotrf.f

INPUTS

  cplx  = 1 if real, 2 if complex
  uplo  = character, 'u' or 'l'
  A_nrows = matrix size
  A = pointer to gpu memory location of matrix A
  lda   = leading dimension of matrix A
  devInfo = error/status code returned by the solver (0 on success)

SOURCE

subroutine abi_gpu_xpotrf_cptr(cplx, uplo, A_nrows, &
                   A, lda, &
                   devInfo)

  ! Arguments ------------------------------------
  integer,         intent(in   ) :: cplx
  character(len=1),intent(in   ) :: uplo
  integer,         intent(in   ) :: A_nrows
  type(c_ptr),     intent(in   ) :: A
  integer,         intent(in   ) :: lda
  integer,         intent(inout) :: devInfo

  ! Local variables ------------------------------
  integer     :: bufferSize
  type(c_ptr) :: gpu_ptr

! *************************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  ! probe the needed bufferSize
  call gpu_xpotrf_buffersize(cplx, uplo, &
                 A_nrows, &
                 A, lda, &
                 bufferSize)

  select case(cplx)

  case (1)
    ! resize work array if needed and retrieve the work pointer to use
    if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
          .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
      call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
      gpu_ptr = c_loc(r_work_managed)
    else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
      call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
      gpu_ptr = xomp_get_mapped_ptr(c_loc(r_work))
#else
      call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(1,c_size_t)*bufferSize*dp)
      gpu_ptr = gpu_work
#endif
    end if

  case (2)
    ! resize work array if needed and retrieve the work pointer to use
    if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
          .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
      call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
      gpu_ptr = c_loc(c_work_managed)
    else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
      call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
      gpu_ptr = xomp_get_mapped_ptr(c_loc(c_work))
#else
      ! complex workspace elements occupy 2*dp bytes
      call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(2,c_size_t)*bufferSize*dp)
      gpu_ptr = gpu_work
#endif
    end if

  end select

  ! and compute (finally)
  call gpu_xpotrf(cplx, uplo, &
                 A_nrows, &
                 A, lda, &
                 gpu_ptr, bufferSize, devInfo)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,A_nrows,lda,devInfo,bufferSize/))
  ABI_UNUSED((/uplo/))
  ABI_UNUSED((/A,gpu_ptr/))
#endif

end subroutine abi_gpu_xpotrf_cptr

m_abi_gpu_linalg/abi_gpu_xscal [ Functions ]

NAME

  abi_gpu_xscal

FUNCTION

  Compute a BLAS-1 SCAL operation on GPU
  x = alpha * x

INPUTS

  cplx   = 1 if real, 2 if complex
  size   = vector size
  alpha  = scalar complex value
  x      = pointer to gpu memory location of array x
  incrx  = stride between consecutive elements of x

SOURCE

subroutine abi_gpu_xscal_cptr(cplx, size, alpha, x, incrx)

  !Arguments ------------------------------------
  integer,      intent(in)    :: cplx
  integer,      intent(in)    :: size
  complex(dpc), intent(in)    :: alpha
  type(c_ptr),  intent(in)    :: x
  integer,      intent(in)    :: incrx

! *************************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xscal(cplx, size, alpha, x, incrx)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,incrx,size/))
  ABI_UNUSED((/alpha/))
  ABI_UNUSED((/x/))
#endif

end subroutine abi_gpu_xscal_cptr

m_abi_gpu_linalg/abi_gpu_xsymm [ Functions ]

NAME

  abi_gpu_xsymm

FUNCTION

  Compute a scalar-matrix-matrix product with one symmetric matrix and add the result to a scalar-matrix product on GPU
  c = alpha * a * b + beta * c    , if side == 'L'
  c = alpha * b * a + beta * c    , if side == 'R'

INPUTS

  cplx  = 1 if real, 2 if complex
  side  = Specifies whether the symmetric matrix a appears on the left
          or right of b as follows:
              L or l: c = alpha*a*b + beta*c
              R or r: c = alpha*b*a + beta*c
  uplo  = Specifies whether the upper or lower triangular part of the
          symmetric matrix a is referenced as follows:
              U or u: the upper triangular part of a is used.
              L or l: the lower triangular part of a is used
  m     = number of rows of the matrix c (a is m-by-m when side = 'L')
  n     = number of columns of the matrix c (a is n-by-n when side = 'R')
  alpha = alpha scalar coefficient for matrix a
  a     = pointer to gpu memory location of matrix a
  lda   = first dimension of a
  b     = pointer to gpu memory location of matrix b
  ldb   = first dimension of b
  beta  = beta scalar coefficient for matrix c
  c     = pointer to gpu memory location of matrix c
  ldc   = first dimension of c

OUTPUT

  c     = c matrix

SOURCE

subroutine abi_gpu_xsymm_cptr(cplx,side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)

!Arguments ------------------------------------
 integer,intent(in) :: cplx,lda,ldb,ldc,m,n
 complex(dpc),intent(in) :: alpha,beta
 character(len=1),intent(in) :: side,uplo
 type(c_ptr),intent(in) :: a,b
 type(c_ptr),intent(in) :: c

! *********************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xsymm(cplx,side,uplo,m,n,alpha,&
      a,lda,b,ldb,beta,c,ldc)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,lda,ldb,ldc,m,n/))
  ABI_UNUSED((/alpha,beta/))
  ABI_UNUSED((/side,uplo/))
  ABI_UNUSED((/a,b,c/))
#endif

end subroutine abi_gpu_xsymm_cptr

m_abi_gpu_linalg/abi_gpu_xtrsm [ Functions ]

NAME

  abi_gpu_xtrsm

FUNCTION

 Solves a matrix equation (one matrix operand is triangular) on GPU.
 The xtrsm routines solve one of the following matrix equations
 op(a)*x = alpha*b
 or
 x*op(a) = alpha*b,

INPUTS

 cplx= 1 if real, 2 if complex
 side= Specifies whether op(a) appears on the left or right of x for
      the operation to be performed as follows:
      L or l op(a)*x = alpha*b
      R or r x*op(a) = alpha*b
 uplo= Specifies whether the matrix a is an upper or lower triangular
      matrix as follows:
      U or u Matrix a is an upper triangular matrix.
      L or l Matrix a is a lower triangular matrix
 transa= Specifies the form of op(a) to be used in the matrix
      multiplication as follows:
      N or n op(a) = a
      T or t op(a) = a'
      C or c op(a) = conjg(a')
 diag= Specifies whether or not a is unit triangular as follows:
      U or u Matrix a is assumed to be unit triangular.
      N or n Matrix a is not assumed to be unit triangular.
 m= Specifies the number of rows of b. The value of m must be at least zero
 n= Specifies the number of columns of b. The value of n must be at least zero
 alpha= Specifies the scalar alpha. When alpha is zero, then a is not referenced and b
      need not be set before entry.
  a = pointer to gpu memory location of  array a, DIMENSION (lda, k), where k is m when side = 'L' or 'l' and is n
      when side = 'R' or 'r'.
 lda= Specifies the first dimension of a as declared in the calling
     (sub)program. When side = 'L' or 'l', then lda must be at least max(1,
      m), when side = 'R' or 'r', then lda must be at least max(1, n).
  b = pointer to gpu memory location of  b Array, DIMENSION (ldb,n). Before entry, the leading m-by-n part of the array
     b must contain the right-hand side matrix b.
 ldb= Specifies the first dimension of b as declared in the calling
     (sub)program. The value of ldb must be at least max(1, m).

OUTPUT

  b

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/gpu_linalg.cu

SOURCE

subroutine abi_gpu_xtrsm_cptr(cplx,side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)

!Arguments ------------------------------------
 integer, intent(in) :: cplx,lda,ldb,m,n
 complex(dpc), intent(in) :: alpha
 character(len=1), intent(in) :: side,uplo,transa,diag
 type(c_ptr),intent(in) :: a
 type(c_ptr),intent(in) :: b

! *********************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_xtrsm(cplx,side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/cplx,lda,ldb,m,n/))
  ABI_UNUSED((/alpha/))
  ABI_UNUSED((/side,uplo,transa,diag/))
  ABI_UNUSED((/a,b/))
#endif

end subroutine abi_gpu_xtrsm_cptr
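
A sketch of a typical triangular solve, assuming abi_gpu_xtrsm is the generic interface over the _cptr variant above and that a and b already reside on the GPU:

   ! Hypothetical sketch: solve a*x = 1*b with a (m x m) complex upper triangular,
   ! not transposed, non-unit diagonal; b (m x n) is overwritten by the solution x
   call abi_gpu_xtrsm(2, 'l', 'u', 'n', 'n', m, n, cone, a_gpu, m, b_gpu, m)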

m_abi_gpu_linalg/abi_gpu_zhemm [ Functions ]

NAME

  abi_gpu_zhemm

FUNCTION

  Compute a scalar-matrix-matrix product with one Hermitian matrix and add the result to a scalar-matrix product on GPU
  c = alpha * a * b + beta * c    , if side == 'L'
  c = alpha * b * a + beta * c    , if side == 'R'

INPUTS

  side  = Specifies whether the Hermitian matrix a appears on the left
          or right of b as follows:
              L or l: c = alpha*a*b + beta*c
              R or r: c = alpha*b*a + beta*c
  uplo  = Specifies whether the upper or lower triangular part of the
          Hermitian matrix a is referenced as follows:
              U or u: the upper triangular part of a is used.
              L or l: the lower triangular part of a is used
  m     = number of rows of the matrix c (a is m-by-m when side = 'L')
  n     = number of columns of the matrix c (a is n-by-n when side = 'R')
  alpha = alpha scalar coefficient for matrix a
  a     = pointer to gpu memory location of matrix a
  lda   = first dimension of a
  b     = pointer to gpu memory location of matrix b
  ldb   = first dimension of b
  beta  = beta scalar coefficient for matrix c
  c     = pointer to gpu memory location of matrix c
  ldc   = first dimension of c

OUTPUT

  c     = c matrix

SOURCE

subroutine abi_gpu_zhemm_cptr(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)

!Arguments ------------------------------------
 integer,intent(in) :: lda,ldb,ldc,m,n
 complex(dpc),intent(in) :: alpha,beta
 character(len=1),intent(in) :: side,uplo
 type(c_ptr),intent(in) :: a,b
 type(c_ptr),intent(in) :: c

! *********************************************************************

  if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
    ABI_BUG("A GPU wrapper was called while GPU support is disabled!")
  end if

#ifdef HAVE_GPU

  call gpu_zhemm(side,uplo,m,n,alpha,&
      a,lda,b,ldb,beta,c,ldc)

  if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
    ! CUDA/HIP linalg calls run asynchronously and OpenMP is unaware of them.
    ! Therefore, we issue a stream sync here to avoid
    ! potential mistakes in the calling context.
    call gpu_linalg_stream_synchronize()
  end if

#else
  ! Unused if GPU code disabled
  ABI_UNUSED((/lda,ldb,ldc,m,n/))
  ABI_UNUSED((/alpha,beta/))
  ABI_UNUSED((/side,uplo/))
  ABI_UNUSED((/a,b,c/))
#endif

end subroutine abi_gpu_zhemm_cptr

m_abi_gpu_linalg/alloc_on_gpu [ Functions ]

NAME

  alloc_on_gpu

FUNCTION

  Allocate size bytes in GPU memory and return the corresponding location in gpu_ptr

INPUTS

  size= size in bytes to allocate

OUTPUT

  gpu_ptr= C_PTR on gpu memory location that has been allocated

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

SOURCE

subroutine alloc_on_gpu(gpu_ptr,size)

!Arguments ------------------------------------
 type(c_ptr),                intent(inout) :: gpu_ptr
 integer(kind=c_size_t),     intent(in)    :: size ! size in bytes to allocate

 ABI_UNUSED(gpu_ptr)
 ABI_UNUSED(size)

end subroutine alloc_on_gpu
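
Taken together with the other helpers documented in this file (gpu_memset, gpu_allocated_impl, dealloc_on_gpu), the typical device-memory lifecycle looks as follows; a sketch only, with an arbitrary buffer size:

   ! Hypothetical sketch of the device-memory lifecycle
   type(c_ptr)        :: buf
   logical(kind=c_bool) :: ok
   integer(c_size_t)  :: nbytes

   nbytes = int(1024, c_size_t)
   call alloc_on_gpu(buf, nbytes)    ! reserve 1024 bytes on the device
   call gpu_memset(buf, 0, nbytes)   ! zero-fill the buffer
   call gpu_allocated_impl(buf, ok)  ! ok becomes true for an allocated pointer
   call dealloc_on_gpu(buf)          ! release the memory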

m_abi_gpu_linalg/check_gpu_mem [ Functions ]

NAME

  check_gpu_mem

FUNCTION

  Print information about amount of free memory on GPU and total amount of memory on GPU (current device).

INPUTS

  str is a string message (character array).

OUTPUT

  None

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

SOURCE

subroutine check_gpu_mem(str)

  !Arguments ------------------------------------
  character (KIND=c_char), intent(in), target  :: str(*)
  !Local variables ------------------------------
  type(c_ptr)                                  :: dummy

  if(.false.) dummy=c_loc(str)

end subroutine check_gpu_mem

m_abi_gpu_linalg/copy_from_gpu [ Functions ]

NAME

  copy_from_gpu

FUNCTION

  Copy size_in_bytes bytes from GPU memory (pointed to by gpu_ptr) to CPU memory (array dtab)

INPUTS

  size_in_bytes = size in bytes to copy
  gpu_ptr = C_PTR on gpu memory location that has been allocated

OUTPUT

  dtab = Fortran array that will contain the data

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

SOURCE

subroutine copy_from_gpu(dtab,gpu_ptr,size_in_bytes)

!Arguments ------------------------------------
 real(dp),dimension(*)               :: dtab
 type(c_ptr)                         :: gpu_ptr
 integer(kind=c_size_t), intent(in)  :: size_in_bytes ! size in bytes (to be transferred)

!Local variables ------------------------------
 type(c_ptr)                         :: cpu_ptr

 if(.false.) write(std_out,*) dtab(1)
 ABI_UNUSED(cpu_ptr)
 ABI_UNUSED(gpu_ptr)
 ABI_UNUSED(size_in_bytes)

end subroutine copy_from_gpu

m_abi_gpu_linalg/copy_gpu_to_gpu [ Functions ]

NAME

  copy_gpu_to_gpu

FUNCTION

  Copy size_in_bytes bytes from one GPU memory location (src) to another (dest)

INPUTS

  size_in_bytes = size in bytes to copy
  src_gpu_ptr = C_PTR on gpu memory

OUTPUT

  dest_gpu_ptr = C_PTR on gpu memory

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

PARENTS

      lobpcgwf,m_abi_gpu_linalg

SOURCE

subroutine copy_gpu_to_gpu(cpu_ptr,gpu_ptr,size_in_bytes)

  !Arguments ------------------------------------
  type(c_ptr)                         :: cpu_ptr
  type(c_ptr)                         :: gpu_ptr
  integer(kind=c_size_t), intent(in)  :: size_in_bytes ! size in bytes (to be transferred)

  ABI_UNUSED(cpu_ptr)
  ABI_UNUSED(gpu_ptr)
  ABI_UNUSED(size_in_bytes)

end subroutine copy_gpu_to_gpu

m_abi_gpu_linalg/copy_on_gpu [ Functions ]

NAME

  copy_on_gpu

FUNCTION

  Copy size_in_bytes bytes from CPU memory (array dtab) to GPU memory (pointed to by gpu_ptr)

INPUTS

  size_in_bytes = size in bytes to copy
  dtab = Fortran array to copy

OUTPUT

  gpu_ptr= C_PTR on gpu memory location

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

SOURCE

subroutine copy_on_gpu(dtab,gpu_ptr,size_in_bytes)

  !Arguments ------------------------------------
  real(dp),dimension(*)               :: dtab
  type(c_ptr)                         :: gpu_ptr
  integer(kind=c_size_t), intent(in)  :: size_in_bytes ! size in bytes (to be transferred)

  !Local variables ------------------------------
  type(c_ptr)                         :: cpu_ptr

  if(.false.) write(std_out,*) dtab(1)
  ABI_UNUSED(cpu_ptr)
  ABI_UNUSED(gpu_ptr)
  ABI_UNUSED(size_in_bytes)

end subroutine copy_on_gpu

m_abi_gpu_linalg/dealloc_on_gpu [ Functions ]

NAME

  dealloc_on_gpu

FUNCTION

  Free the GPU memory location pointed to by gpu_ptr

INPUTS

OUTPUT

  gpu_ptr= C_PTR on the gpu memory location to be freed

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

SOURCE

subroutine dealloc_on_gpu(gpu_ptr)

  !Arguments ------------------------------------
  type(c_ptr) :: gpu_ptr

  ABI_UNUSED(gpu_ptr)

end subroutine dealloc_on_gpu

m_abi_gpu_linalg/gpu_allocated_impl [ Functions ]

NAME

  gpu_allocated_impl

FUNCTION

  Check if pointer points to allocated gpu device memory.

INPUTS

  gpu_ptr= C_PTR on gpu memory location

OUTPUT

  is_allocated= logical(c_bool) : true if allocated, false otherwise

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

PARENTS

      lobpcgwf

SOURCE

subroutine gpu_allocated_impl(gpu_ptr, is_allocated)

  !Arguments ------------------------------------
  type(c_ptr)                       :: gpu_ptr
  logical(kind=c_bool), intent(out) :: is_allocated

  ABI_UNUSED(gpu_ptr)

  is_allocated = .false.

end subroutine gpu_allocated_impl

m_abi_gpu_linalg/gpu_device_synchronize [ Functions ]

NAME

  gpu_device_synchronize

FUNCTION

  Wait for all pending GPU operations (compute and memory transfers) to complete.

INPUTS

  None

OUTPUT

  None

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

SOURCE

subroutine gpu_device_synchronize()
  use, intrinsic :: iso_c_binding
  implicit none
end subroutine gpu_device_synchronize

m_abi_gpu_linalg/gpu_linalg_init [ Functions ]

NAME

  gpu_linalg_init

FUNCTION

  Initialization of the linear algebra environment on GPU

INPUTS

OUTPUT

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/gpu_linalg.cu

SOURCE

subroutine gpu_linalg_init()

end subroutine gpu_linalg_init

m_abi_gpu_linalg/gpu_linalg_shutdown [ Functions ]

NAME

  gpu_linalg_shutdown

FUNCTION

  Shut down the linear algebra environment on GPU

INPUTS

OUTPUT

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/gpu_linalg.cu

SOURCE

subroutine gpu_linalg_shutdown()

end subroutine gpu_linalg_shutdown

m_abi_gpu_linalg/gpu_managed_ptr_status [ Functions ]

NAME

  gpu_managed_ptr_status

FUNCTION

  Print information about a managed pointer (host or device address when accessible).

INPUTS

  gpu_ptr= C_PTR on gpu memory location

OUTPUT

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

PARENTS

SOURCE

subroutine gpu_managed_ptr_status(gpu_ptr, str)

  !Arguments ------------------------------------
  type(c_ptr)                                  :: gpu_ptr
  character (KIND=c_char), intent(in), target  :: str(*)
  !Local variables ------------------------------
  type(c_ptr)                                  :: dummy

  ABI_UNUSED(gpu_ptr)
  if(.false.) dummy=c_loc(str)

end subroutine gpu_managed_ptr_status

m_abi_gpu_linalg/gpu_memset [ Functions ]

NAME

  gpu_memset

FUNCTION

  Initializes or sets device memory to a value.

INPUTS

  gpu_ptr= C_PTR on gpu memory location
  val= value used to initialize each byte
  array_size= number of bytes to initialize

OUTPUT

  gpu_ptr= C_PTR on gpu memory location

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/dev_spec.cu

PARENTS

      lobpcgwf

SOURCE

subroutine gpu_memset(gpu_ptr, val, array_size)

  !Arguments ------------------------------------
  type(c_ptr)                         :: gpu_ptr
  integer(kind=c_int32_t), intent(in) :: val
  integer(kind=c_size_t),  intent(in) :: array_size

  ABI_UNUSED(gpu_ptr)
  ABI_UNUSED(val)
  ABI_UNUSED(array_size)

end subroutine gpu_memset

m_abi_gpu_linalg/gpu_xaxpy [ Functions ]

NAME

  gpu_xaxpy

FUNCTION

  Compute a BLAS-1 AXPY operation on GPU
  y = alpha * x + y

INPUTS

  cplx  = 1 if real, 2 if complex
  size  = vector size
  alpha = scalar complex value
  x_gpu = pointer to gpu memory location of array x
  incrx = stride between consecutive elements of x
  y_gpu = pointer to gpu memory location of array y
  incry = stride between consecutive elements of y

SOURCE

subroutine gpu_xaxpy(cplx, size, alpha, x_gpu, incrx, y_gpu, incry)

  !Arguments ------------------------------------
  integer,      intent(in)    :: cplx
  integer,      intent(in)    :: size
  complex(dpc), intent(in)    :: alpha
  type(c_ptr),  intent(in)    :: x_gpu
  integer,      intent(in)    :: incrx
  type(c_ptr),  intent(inout) :: y_gpu
  integer,      intent(in)    :: incry

  ABI_UNUSED((/cplx,size,incrx,incry/))
  ABI_UNUSED(alpha)
  ABI_UNUSED_A(x_gpu)
  ABI_UNUSED_A(y_gpu)
end subroutine gpu_xaxpy

m_abi_gpu_linalg/gpu_xcopy [ Functions ]

NAME

  gpu_xcopy

FUNCTION

  Compute a BLAS-1 COPY operation on GPU
  y = x (copy x into y)

INPUTS

  cplx   = 1 if real, 2 if complex
  size   = input vector size
  x_gpu  = pointer to gpu memory location of array x
  incrx  = stride between consecutive elements of x
  y_gpu  = pointer to gpu memory location of array y
  incry  = stride between consecutive elements of y

SOURCE

subroutine gpu_xcopy(cplx, size, x_gpu, incrx, y_gpu, incry)

  !Arguments ------------------------------------
  integer,      intent(in)    :: cplx
  integer,      intent(in)    :: size
  type(c_ptr),  intent(in)    :: x_gpu
  integer,      intent(in)    :: incrx
  type(c_ptr),  intent(inout) :: y_gpu
  integer,      intent(in)    :: incry

  ABI_UNUSED((/cplx,size,incrx,incry/))
  ABI_UNUSED_A(x_gpu)
  ABI_UNUSED_A(y_gpu)
end subroutine gpu_xcopy

m_abi_gpu_linalg/gpu_xgemm [ Functions ]

NAME

  gpu_xgemm

FUNCTION

  Compute a scalar-matrix-matrix product and add the result to a scalar-matrix product on GPU
  c = alpha * op(a) * op(b) + beta * c

INPUTS

  cplx  = 1 if real, 2 if complex
  transa= form of op(a) to be used in the matrix multiplication
  transb= form of op(b) to be used in the matrix multiplication
  m     = number of rows of the matrix op(a) and of the matrix c
  n     = number of columns of the matrix op(b) and of the matrix c
  k     = number of columns of the matrix op(a) and the number of rows of the matrix op(b)
  alpha = alpha scalar coefficient for matrix op(a)
  a_gpu = pointer to gpu memory location of  matrix a
  lda   = first dimension of a
  b_gpu = pointer to gpu memory location of  matrix b
  ldb   = first dimension of b
  beta  = beta scalar coefficient for matrix c
  c_gpu = pointer to gpu memory location of  matrix c
  ldc   = first dimension of c

OUTPUT

  c     = c matrix

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/gpu_linalg.cu

SOURCE

subroutine gpu_xgemm(cplx,transa,transb,m,n,k,alpha,a_gpu,lda,b_gpu,ldb,beta,c_gpu,ldc)

!Arguments ------------------------------------
 integer,intent(in) :: cplx,lda,ldb,ldc,m,n,k
 complex(dpc),intent(in) :: alpha,beta
 character(len=1),intent(in) :: transa,transb
 type(c_ptr),intent(in) :: a_gpu,b_gpu
 type(c_ptr),intent(inout) :: c_gpu
!Local variables ------------------------------
 type(c_ptr) :: cptr

! *********************************************************************

 if (.false.) then
   cptr=a_gpu;cptr=b_gpu;cptr=c_gpu
   write(std_out,*) transa,transb,cplx,lda,ldb,ldc,m,n,k,alpha,beta
 end if

end subroutine gpu_xgemm

m_abi_gpu_linalg/gpu_xscal [ Functions ]

NAME

  gpu_xscal

FUNCTION

  Compute a BLAS-1 SCAL operation on GPU
  x = alpha * x

INPUTS

  cplx   = 1 if real, 2 if complex
  size   = vector size
  alpha  = scalar complex value
  x_gpu  = pointer to gpu memory location of array x
  incrx  = stride between consecutive elements of x

SOURCE

subroutine gpu_xscal(cplx, size, alpha, x_gpu, incrx)

  !Arguments ------------------------------------
  integer,      intent(in)    :: cplx
  integer,      intent(in)    :: size
  complex(dpc), intent(in)    :: alpha
  type(c_ptr),  intent(in)    :: x_gpu
  integer,      intent(in)    :: incrx

  ABI_UNUSED((/cplx,size,incrx/))
  ABI_UNUSED(alpha)
  ABI_UNUSED_A(x_gpu)
end subroutine gpu_xscal

m_abi_gpu_linalg/gpu_xsygvd [ Functions ]

NAME

  gpu_xsygvd

FUNCTION

  Compute a LAPACK SYGVD operation on GPU:
  compute eigenvalues/eigenvectors of a real generalized
  symmetric-definite eigenproblem

 See cusolver documentation
 https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-sygvd

 See also LAPACK doc in reference implementation:
 https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dsygvd.f

INPUTS

  cplx  = 1 if real, 2 if complex
  itype = integer, type of problem
  jobz  = character, 'n' (eigenvalues only) or 'v' (eigenvalues + eigenvectors)
  uplo  = character, 'u' or 'l'
  A_nrows = matrix size
  A_ptr = pointer to gpu memory location of matrix A
  lda   = leading dimension of matrix A
  B_ptr = pointer to gpu memory location of matrix B
  ldb   = leading dimension of matrix B
  W_ptr = pointer to gpu memory location of array W (output eigenvalues)
  work_ptr = pointer to gpu memory location of the scratch workspace
  lwork = workspace size, as returned by gpu_xsygvd_bufferSize
  devInfo = error/status code returned by the solver (0 on success)

SOURCE

subroutine gpu_xsygvd(cplx, itype, jobz, uplo, A_nrows, &
  &                   A_ptr, lda, &
  &                   B_ptr, ldb, &
  &                   W_ptr,      &
  &                   work_ptr, lwork, &
  &                   devInfo)

  ! Arguments ------------------------------------
  integer,         intent(in   ) :: cplx
  integer,         intent(in   ) :: itype
  character(len=1),intent(in   ) :: jobz
  character(len=1),intent(in   ) :: uplo
  integer,         intent(in   ) :: A_nrows
  type(c_ptr),     intent(in   ) :: A_ptr
  integer,         intent(in   ) :: lda
  type(c_ptr),     intent(in   ) :: B_ptr
  integer,         intent(in   ) :: ldb
  type(c_ptr),     intent(inout) :: W_ptr
  type(c_ptr),     intent(inout) :: work_ptr
  integer,         intent(in   ) :: lwork
  integer,         intent(inout) :: devInfo

  ABI_UNUSED((/cplx,itype,A_nrows,lda,ldb,lwork,devInfo/))
  ABI_UNUSED((/jobz,uplo/))
  ABI_UNUSED((/A_ptr,B_ptr,W_ptr,work_ptr/))
end subroutine gpu_xsygvd

m_abi_gpu_linalg/gpu_xsygvd_bufferSize [ Functions ]

NAME

  gpu_xsygvd_bufferSize

FUNCTION

  Compute required size for auxiliary work buffer used internally by cusolver
  in cusolverDnDsygvd / cusolverDnZhegvd

 See cusolver documentation
 https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-sygvd

INPUTS

  cplx  = 1 if real, 2 if complex
  itype = integer, type of problem
  jobz  = character, 'n' (eigenvalues only) or 'v' (eigenvalues + eigenvectors)
  uplo  = character, 'u' or 'l'
  A_nrows = matrix size
  A_ptr = pointer to gpu memory location of matrix A
  lda   = leading dimension of matrix A
  B_ptr = pointer to gpu memory location of matrix B
  ldb   = leading dimension of matrix B
  W_ptr = pointer to gpu memory location of array W (output eigenvalues)
  lwork = required workspace size computed by this routine

SOURCE

subroutine gpu_xsygvd_buffersize(cplx, itype, jobz, uplo, A_nrows, &
  &                              A_ptr, lda, &
  &                              B_ptr, ldb, &
  &                              W_ptr,      &
  &                              lwork)

  ! Arguments ------------------------------------
  integer,         intent(in   ) :: cplx
  integer,         intent(in   ) :: itype
  character(len=1),intent(in   ) :: jobz
  character(len=1),intent(in   ) :: uplo
  integer,         intent(in   ) :: A_nrows
  type(c_ptr),     intent(in   ) :: A_ptr
  integer,         intent(in   ) :: lda
  type(c_ptr),     intent(in   ) :: B_ptr
  integer,         intent(in   ) :: ldb
  type(c_ptr),     intent(inout) :: W_ptr
  integer,         intent(in   ) :: lwork

  ABI_UNUSED((/cplx,itype,A_nrows,lda,ldb,lwork/))
  ABI_UNUSED((/jobz,uplo/))
  ABI_UNUSED((/A_ptr,B_ptr,W_ptr/))
end subroutine gpu_xsygvd_bufferSize

m_abi_gpu_linalg/gpu_xtrsm [ Functions ]

NAME

  gpu_xtrsm

FUNCTION

 Solves a matrix equation (one matrix operand is triangular) on GPU.
 The xtrsm routines solve one of the following matrix equations
 op(a)*x = alpha*b
 or
 x*op(a) = alpha*b,

INPUTS

 cplx= 1 if real, 2 if complex
 side= Specifies whether op(a) appears on the left or right of x for
      the operation to be performed as follows:
      L or l op(a)*x = alpha*b
      R or r x*op(a) = alpha*b
 uplo= Specifies whether the matrix a is an upper or lower triangular
      matrix as follows:
      U or u Matrix a is an upper triangular matrix.
      L or l Matrix a is a lower triangular matrix
 transa= Specifies the form of op(a) to be used in the matrix
      multiplication as follows:
      N or n op(a) = a
      T or t op(a) = a'
      C or c op(a) = conjg(a')
 diag= Specifies whether or not a is unit triangular as follows:
      U or u Matrix a is assumed to be unit triangular.
      N or n Matrix a is not assumed to be unit triangular.
 m= Specifies the number of rows of b. The value of m must be at least zero
 n= Specifies the number of columns of b. The value of n must be at least zero
 alpha= Specifies the scalar alpha. When alpha is zero, then a is not referenced and b
      need not be set before entry.
  a_gpu = pointer to gpu memory location of  array a, DIMENSION (lda, k), where k is m when side = 'L' or 'l' and is n
      when side = 'R' or 'r'.
 lda= Specifies the first dimension of a as declared in the calling
     (sub)program. When side = 'L' or 'l', then lda must be at least max(1,
      m), when side = 'R' or 'r', then lda must be at least max(1, n).
  b_gpu = pointer to gpu memory location of  b Array, DIMENSION (ldb,n). Before entry, the leading m-by-n part of the array
     b must contain the right-hand side matrix b.
 ldb= Specifies the first dimension of b as declared in the calling
     (sub)program. The value of ldb must be at least max(1, m).

OUTPUT

  b_gpu

SIDE EFFECTS

   WARNING! : this routine is a dummy one when HAVE_GPU is not enabled
   the correct one is in 17_gpu_toolbox/gpu_linalg.cu

SOURCE

subroutine gpu_xtrsm(cplx,side,uplo,transa,diag,m,n,alpha,a_gpu,lda,b_gpu,ldb)

!Arguments ------------------------------------
 integer, intent(in) :: cplx,lda,ldb,m,n
 complex(dpc), intent(in) :: alpha
 character(len=1), intent(in) :: side,uplo,transa,diag
 type(c_ptr),intent(in) :: a_gpu
 type(c_ptr),intent(inout) :: b_gpu
!Local variables ------------------------------
 type(c_ptr) :: cptr

! *********************************************************************

 if (.false.) then
   cptr=a_gpu;cptr=b_gpu
   write(std_out,*) side,uplo,transa,diag,cplx,lda,ldb,m,n,alpha
 end if

end subroutine gpu_xtrsm

m_abi_linalg/gpu_xorthonormalize [ Functions ]

NAME

  gpu_xorthonormalize

FUNCTION

  This routine computes the overlap of two complex wavefunctions (for a given number of bands)
  and orthonormalizes them on GPU:
      - computes the product of the two rectangular matrices
        containing the wavefunctions psi and S.psi (where S is the
        overlap operator, including the PAW terms if necessary),
      - does a Cholesky decomposition of this overlap,
      - rotates the initial matrix blockvectorx by the triangular matrix so as to
        obtain an orthonormal set of wavefunctions (see the sketch below)
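
  In equation form, with X = blockvectorx, BX = blockvectorbx and S = sqgram,
  the steps above read (a sketch of the operations coded below):

  \[
    S = X^{H}(BX), \qquad
    S = U^{H} U \ \text{(Cholesky, upper triangular } U\text{)}, \qquad
    X \leftarrow X \, U^{-1}
  \]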

INPUTS

  blockvectorbx = matrix of dimension (vectsize,blocksize) as a GPU ptr
                  (e.g. block of overlap*wavefunction)
  blocksize     = dimension of matrices (e.g. number of bands)
  spaceComm     = communicator used for MPI parallelization
  vectsize      = dimension of matrices (e.g. number of G vectors)

OUTPUT

  sqgram        = Cholesky decomposition of transpose(blockvectorx)*blockvectorbx as a GPU ptr

SIDE EFFECTS

  blockvectorx  = on input, matrix of dimension (vectsize,blocksize) as a GPU ptr
                  (e.g block of wavefunction)
  blockvectorx  = on output, orthonormalized wavefunction. as a GPU ptr

SOURCE

subroutine gpu_xorthonormalize(blockvectorx_gpu,blockvectorbx_gpu,blocksize,spaceComm,&
&                              sqgram_gpu,vectsize,&
&                              x_cplx,timopt,tim_xortho) ! optional arguments

  use, intrinsic :: iso_c_binding
  implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: blocksize,spaceComm,vectsize,x_cplx
 integer, intent(in), optional :: timopt,tim_xortho
!arrays
 type(c_ptr),intent(inout) :: blockvectorbx_gpu, blockvectorx_gpu, sqgram_gpu
!Local variables-------------------------------
#if defined HAVE_GPU
 integer :: ierr,info
 real(dp),    dimension(:,:),allocatable, target :: d_sqgram
 complex(dpc),dimension(:,:),allocatable, target :: z_sqgram
 character :: tr
 real(dp) :: tsec(2)
 integer(c_size_t) :: size
#else
 type(c_ptr) :: cptr_a
#endif
 character(len=500) :: message

! *********************************************************************

#if defined HAVE_GPU
 if (present(tim_xortho).and.present(timopt)) then
   if(abs(timopt)==3) then
     call timab(tim_xortho,1,tsec)
   end if
 end if

 if ( x_cplx == 1 ) then
   tr='t'
   ABI_MALLOC(d_sqgram,(blocksize,blocksize))
 else
   tr='c'
   ABI_MALLOC(z_sqgram,(blocksize,blocksize))
 end if

 ! sqgram = blockvectorx^H * blockvectorbx
 call gpu_xgemm(x_cplx,tr,'n',blocksize,blocksize,vectsize, &
& cone,blockvectorx_gpu,vectsize,blockvectorbx_gpu,vectsize,czero,sqgram_gpu,blocksize)
 call gpu_device_synchronize()
 size=x_cplx*dp*blocksize*blocksize

 if ( x_cplx == 1 ) then
   call copy_from_gpu(d_sqgram, sqgram_gpu, size)
   call xmpi_sum(d_sqgram,spaceComm,ierr)
   call abi_xpotrf('u',blocksize,d_sqgram,blocksize,info)
   call copy_on_gpu(d_sqgram, sqgram_gpu, size)
 else
   call copy_from_gpu(z_sqgram, sqgram_gpu, size)
   call xmpi_sum(z_sqgram,spaceComm,ierr)
   call abi_xpotrf('u',blocksize,z_sqgram,blocksize,info)
   call copy_on_gpu(z_sqgram, sqgram_gpu, size)
 end if

 if (info /= 0 ) then
   write(message,'(a,i3)') '  xpotrf, info=',info
   ABI_WARNING(message)
 end if

 ! blockvectorx <- blockvectorx * U^-1
 call gpu_xtrsm(x_cplx,'r','u','n','n',vectsize,blocksize,cone,sqgram_gpu,blocksize,&
&               blockvectorx_gpu,vectsize)

 if(x_cplx==1) then
   ABI_FREE(d_sqgram)
 else
   ABI_FREE(z_sqgram)
 end if
 if (present(tim_xortho).and.present(timopt)) then
   if(abs(timopt)==3) then
     call timab(tim_xortho,2,tsec)
   end if
 end if
 return

#else
 message='  This routine must not be called when GPU support is disabled!'
 ABI_BUG(message)
 if (.false.) then
   write(std_out,*) blocksize,vectsize,spaceComm,x_cplx
   if (present(timopt))  write(std_out,*) timopt
   if (present(tim_xortho))  write(std_out,*) tim_xortho
   cptr_a=blockvectorbx_gpu;cptr_a=blockvectorx_gpu;cptr_a=sqgram_gpu
 end if
#endif

end subroutine gpu_xorthonormalize