TABLE OF CONTENTS
- ABINIT/m_gemm_nonlop_gpu
- m_gemm_nonlop_gpu/destroy_gemm_nonlop_gpu
- m_gemm_nonlop_gpu/gemm_nonlop_gpu
- m_gemm_nonlop_gpu/init_gemm_nonlop_gpu
- m_gemm_nonlop_gpu/make_gemm_nonlop_gpu
ABINIT/m_gemm_nonlop_gpu [ Modules ]
NAME
m_gemm_nonlop_gpu
FUNCTION
This module provides functions to compute the nonlocal operator by means of the BLAS GEMM routine. By treating ndat simultaneous wavefunctions, it is able to exploit BLAS3 routines, which leads to excellent CPU efficiency and OpenMP scalability.
COPYRIGHT
Copyright (C) 2014-2024 ABINIT group (AL,MS) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
19 #if defined HAVE_CONFIG_H 20 #include "config.h" 21 #endif 22 23 #include "abi_common.h" 24 25 module m_gemm_nonlop_gpu 26 27 use defs_basis 28 use m_errors 29 use m_abicore 30 use m_xmpi 31 use m_fstrings, only : itoa, ftoa, sjoin 32 33 use m_abi_linalg ! copy_on_gpu, copy_from_gpu, alloc_on_gpu, dealloc_on_gpu, gpu_memset, gpu_allocated 34 use defs_abitypes, only : MPI_type 35 use m_opernlc_ylm_allwf_cpu, only : opernlc_ylm_allwf_cpu 36 use m_pawcprj, only : pawcprj_type 37 use m_gemm_nonlop, only : gemm_nonlop_type,gemm_nonlop_ikpt_this_proc_being_treated,make_gemm_nonlop,gemm_nonlop_kpt 38 39 #if defined(HAVE_GPU_CUDA) 40 use m_gpu_toolbox 41 use m_alloc_hamilt_gpu, only : gemm_nonlop_gpu_data 42 #endif 43 44 #ifdef HAVE_KOKKOS 45 use m_manage_kokkos, only : opernlc_ylm_allwf_kokkos 46 #endif 47 48 #ifdef HAVE_FC_ISO_C_BINDING 49 use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_int64_t, c_float, c_double, c_size_t, c_loc 50 #endif 51 52 implicit none 53 54 private 55 56 public :: init_gemm_nonlop_gpu 57 public :: destroy_gemm_nonlop_gpu 58 public :: make_gemm_nonlop_gpu 59 public :: gemm_nonlop_gpu
m_gemm_nonlop_gpu/destroy_gemm_nonlop_gpu [ Functions ]
[ Top ] [ m_gemm_nonlop_gpu ] [ Functions ]
NAME
destroy_gemm_nonlop_gpu
FUNCTION
Initalization of the gemm_nonlop_kpt array
INPUTS
nkpt= number of k-points
PARENTS
m_gstate
CHILDREN
abi_zgemm_2r,dgemm,opernlc_ylm,xmpi_sum
SOURCE
226 subroutine destroy_gemm_nonlop_gpu(nkpt) 227 228 integer,intent(in) :: nkpt 229 integer :: ikpt 230 231 ! ************************************************************************* 232 233 ! TODO add cycling if kpt parallelism 234 235 ! This function must be called before destroy_gemm_nonlop so it can 236 ! properly figure out which GPU buffer to free. 237 if(.not. allocated(gemm_nonlop_kpt)) then 238 ABI_BUG("gemm_nonlop is already free, cannot free GPU resources !") 239 end if 240 241 ! deallocate GPU ressource for each k point 242 do ikpt = 1,nkpt 243 if(gemm_nonlop_kpt_gpu(ikpt)%nprojs /= -1) then 244 ! deallocate arrays projs, projs_r and projs_i 245 if (allocated(gemm_nonlop_kpt(ikpt)%projs)) then 246 call dealloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs) 247 end if 248 if (allocated(gemm_nonlop_kpt(ikpt)%projs_r)) then 249 call dealloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_r) 250 end if 251 if (allocated(gemm_nonlop_kpt(ikpt)%projs_i)) then 252 call dealloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_i) 253 end if 254 gemm_nonlop_kpt_gpu(ikpt)%nprojs = -1 255 end if 256 end do 257 258 ABI_FREE(gemm_nonlop_kpt_gpu) 259 260 end subroutine destroy_gemm_nonlop_gpu
m_gemm_nonlop_gpu/gemm_nonlop_gpu [ Functions ]
[ Top ] [ m_gemm_nonlop_gpu ] [ Functions ]
NAME
gemm_nonlop_gpu
FUNCTION
Replacement of nonlop.
INPUTS
[gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
PARENTS
m_nonlop
CHILDREN
abi_zgemm_2r,dgemm,opernlc_ylm,xmpi_sum
SOURCE
371 subroutine gemm_nonlop_gpu(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,& 372 & enl,indlmn,istwf_k,& 373 & lambda,lmnmax,matblk,& 374 & mpi_enreg,natom,nattyp,ndat,nkpgin,nkpgout,& 375 & nnlout,npwin,npwout,nspinor,nspinortot,ntypat,paw_opt,& 376 & sij,svectout,& 377 & useylm,vectin,vectout,& 378 & vectproj,gpu_option) 379 380 !Arguments ------------------------------------ 381 !scalars 382 integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout 383 integer,intent(in) :: istwf_k,lmnmax,matblk,natom,ndat,nkpgin 384 integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat 385 integer,intent(in) :: paw_opt,useylm 386 integer,optional,intent(in) :: gpu_option 387 real(dp), target, intent(in) :: lambda(ndat) 388 type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat) 389 type(MPI_type), intent(in) :: mpi_enreg 390 391 !arrays 392 integer, target, intent(in) :: atindx1(natom) 393 integer, target, intent(in) :: indlmn(6,lmnmax,ntypat) 394 integer, intent(in) :: nattyp(ntypat) 395 real(dp), target, intent(in) :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq) 396 real(dp), target, intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3)) 397 real(dp), target, intent(inout) :: vectin (2,npwin*nspinor*ndat) 398 real(dp), target, intent(out) :: svectout(2,npwout*nspinor*(paw_opt/3)*ndat) 399 real(dp), target, intent(inout) :: vectout(2,npwout*nspinor*ndat) !vz_i 400 real(dp), target, intent(inout), ABI_CONTIGUOUS optional :: vectproj(:,:,:) 401 402 ! locals 403 integer :: idat, nprojs, shift, iatom, nlmn, ierr, ibeg, iend 404 integer :: cplex, cplex_enl, cplex_fac 405 integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn 406 integer :: cplex_dgxdt(1), cplex_d2gxdt(1) 407 real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1) 408 real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1) 409 integer :: npw_max 410 integer :: nattyp_max 411 412 logical :: local_vectproj 413 414 real(dp), allocatable, target :: sij_typ(:) 415 416 !type(c_ptr) :: projections_gpu, s_projections_gpu, vnl_projections_gpu 417 real(dp), ABI_CONTIGUOUS pointer :: projections_cpu(:,:,:) 418 real(dp), allocatable, target :: s_projections_cpu(:,:,:), vnl_projections_cpu(:,:,:) 419 420 ! used inside opernlc_ylm_allwf_kokkos_cpp when iphase > 1 421 type(c_ptr) :: vnl_projections2_gpu 422 423 type(c_ptr) :: temp_realvec_gpu 424 425 ! GPU waveform data are allocated in m_alloc_hamilt_gpu 426 !type(c_ptr) :: vectin_gpu, vectout_gpu, svectout_gpu 427 ! Pointers to either CUDA allocated or managed memory data 428 type(c_ptr) :: vectin_ptr, vectout_ptr, svectout_ptr 429 integer(c_size_t) :: vectin_size 430 431 type(c_ptr) :: enl_gpu 432 integer(c_size_t) :: enl_size_bytes 433 434 integer :: sizeof_int 435 436 type(c_ptr) :: atindx1_gpu 437 integer(c_size_t) :: atindx1_size_bytes 438 439 type(c_ptr) :: indlmn_gpu 440 integer(c_size_t) :: indlmn_size_bytes 441 442 type(c_ptr) :: lambda_gpu 443 integer(c_size_t) :: lambda_size_bytes 444 445 type(c_ptr) :: sij_typ_gpu 446 integer(c_size_t) :: sij_typ_size_bytes 447 448 integer(kind=c_int32_t), parameter :: izero = 0 449 integer(kind=c_int32_t), parameter :: fix_realvec_divide_by_2 = 0 450 integer(kind=c_int32_t), parameter :: fix_realvec_zero_out = 1 451 452 integer :: shift_spinor 453 454 ! ************************************************************************* 455 456 ! This function should only be called within CUDA legacy or Kokkos code paths 457 if(gpu_option/=ABI_GPU_LEGACY .and. gpu_option/=ABI_GPU_KOKKOS) then 458 ABI_BUG("Unhandled GPU value !") 459 end if 460 461 npw_max = MAX(npwin, npwout) 462 463 cplex = 2; if (istwf_k>1) cplex=1 464 cplex_enl = 1; if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex? 465 cplex_fac = max(cplex,dimekbq) 466 if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex? 467 468 nprojs = gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%nprojs 469 470 ! These will store the non-local factors for vectin, svectout and vectout respectively 471 call gpu_memset(gemm_nonlop_gpu_data% projections_gpu, izero, INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp) 472 call gpu_memset(gemm_nonlop_gpu_data% s_projections_gpu, izero, INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp) 473 call gpu_memset(gemm_nonlop_gpu_data%vnl_projections_gpu, izero, INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp) 474 475 if (dimekbq>1) then 476 ABI_CHECK(cplex_fac==2,"BUG: invalid cplex_fac==1 when dimekbq=2!") 477 ABI_MALLOC_CUDA(vnl_projections2_gpu, INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp) 478 call gpu_memset(vnl_projections2_gpu, izero, INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp) 479 end if 480 481 vectin_size = 2*npwin*nspinor*ndat*dp 482 483 if(gpu_option==ABI_GPU_LEGACY) then 484 vectin_ptr = gemm_nonlop_gpu_data%vectin_gpu 485 vectout_ptr = gemm_nonlop_gpu_data%vectout_gpu 486 svectout_ptr= gemm_nonlop_gpu_data%svectout_gpu 487 488 call copy_on_gpu(vectin, gemm_nonlop_gpu_data%vectin_gpu, vectin_size) 489 if (choice == 7) then 490 call copy_on_gpu(svectout, gemm_nonlop_gpu_data%svectout_gpu, INT(2, c_size_t) * npwout*nspinor*ndat * dp) 491 end if 492 493 else if(gpu_option==ABI_GPU_KOKKOS) then 494 vectin_ptr =C_LOC(vectin) 495 vectout_ptr =C_LOC(vectout) 496 svectout_ptr=C_LOC(svectout) 497 498 call gpu_data_prefetch_async(vectin_ptr, vectin_size) 499 if (choice == 7) then 500 call gpu_data_prefetch_async(C_LOC(svectout), vectin_size) 501 end if 502 503 end if 504 505 !! gpu alloc and init : enl_gpu 506 enl_size_bytes = dimenl1 * dimenl2 * nspinortot**2 * dimekbq * dp 507 ABI_MALLOC_CUDA( enl_gpu, enl_size_bytes ) 508 call copy_on_gpu(enl, enl_gpu, enl_size_bytes ) 509 510 !! gpu alloc and init atindx1_gpu 511 sizeof_int = 4 512 atindx1_size_bytes = natom * sizeof_int 513 ABI_MALLOC_CUDA( atindx1_gpu, atindx1_size_bytes ) 514 call copy_on_gpu(atindx1, atindx1_gpu, atindx1_size_bytes ) 515 516 !! gpu alloc and init indlmn_gpu 517 indlmn_size_bytes = 6*lmnmax*ntypat * sizeof_int 518 ABI_MALLOC_CUDA( indlmn_gpu, indlmn_size_bytes ) 519 call copy_on_gpu(indlmn, indlmn_gpu, indlmn_size_bytes ) 520 521 !! gpu alloc and init lambda_gpu 522 lambda_size_bytes = ndat * dp 523 ABI_MALLOC_CUDA( lambda_gpu, lambda_size_bytes ) 524 call copy_on_gpu(lambda, lambda_gpu, lambda_size_bytes ) 525 526 if(nprojs == 0) then 527 ! TODO check if this is correct 528 vectout = zero 529 if(paw_opt>0) svectout = vectin 530 return 531 end if 532 533 ! determine precisely when temp_realvec needs to be allocated 534 ! to factorize allocate (resp. deallocate) at the begining (resp. at the end) of subroutine 535 ! to avoid multiple allocate/deallocate that can be costly 536 if (cplex /= 2) then 537 if ( (cpopt < 2) .or. & 538 & (paw_opt == 3 .or. paw_opt == 4) .or. & 539 & (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)) then 540 ABI_MALLOC_CUDA(temp_realvec_gpu, (INT(npw_max, c_size_t) * nspinor * ndat * dp)) 541 end if 542 end if 543 544 ! If vectproj is provided, use it for further calculations, use allocated array otherwise 545 local_vectproj=.false. 546 if(PRESENT(vectproj)) then 547 if(size(vectproj)>1) local_vectproj=.true. 548 end if 549 if (local_vectproj) then 550 projections_cpu => vectproj 551 else 552 ABI_MALLOC( projections_cpu,(cplex, nprojs,nspinor*ndat)) 553 projections_cpu = zero 554 end if 555 ABI_MALLOC( s_projections_cpu,(cplex, nprojs,nspinor*ndat)) ! TODO - TO BE REMOVED ONCE CUDA-IZATION IS OK 556 ABI_MALLOC(vnl_projections_cpu,(cplex_fac, nprojs,nspinor*ndat)) ! TODO - TO BE REMOVED ONCE CUDA-IZATION IS OK 557 s_projections_cpu = zero 558 vnl_projections_cpu = zero 559 560 if(cpopt >= 2) then 561 562 if(.not. local_vectproj) then 563 !This use-case is extremely painful for GEMM GPU nonlop performance. 564 !vectproj paramter should always be provided to avoid it 565 566 ! retrieve from cprjin 567 do idat=1, ndat*nspinor 568 shift = 0 569 do iatom = 1, natom 570 nlmn = cprjin(iatom, idat)%nlmn 571 projections_cpu(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) 572 shift = shift + nlmn 573 end do 574 end do 575 end if 576 577 ! copy from HOST projections_cpu to GPU projections_gpu 578 call copy_on_gpu(projections_cpu, gemm_nonlop_gpu_data%projections_gpu,& 579 INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp) 580 581 else ! cpopt < 2 582 583 ! opernla 584 if(cplex == 2) then 585 586 ! projections_gpu = projs * vectin_gpu 587 call abi_gpu_xgemm(cplex, 'C', 'N', & 588 & nprojs, ndat*nspinor, npwin, & ! M,N,K 589 & cone, & ! alpha 590 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwin, & ! A, LDA 591 & vectin_ptr, npwin, & ! B, LDB 592 & czero, & ! beta 593 & gemm_nonlop_gpu_data%projections_gpu, nprojs) ! C, LDC 594 595 else 596 597 if (.not. gpu_allocated(temp_realvec_gpu)) then 598 ABI_ERROR("Please provide memory allocation for temp_realvec_gpu array") 599 end if 600 601 602 ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i 603 !temp_realvec(1:npwin*nspinor*ndat) = vectin(1,1:npwin*nspinor*ndat) 604 call extract_real_part(temp_realvec_gpu, vectin_ptr, npwin*nspinor*ndat) 605 606 if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then 607 ! do idat=1, ndat*nspinor 608 ! temp_realvec(1+(idat-1)*npwin) = temp_realvec(1+(idat-1)*npwin)/2 609 ! end do 610 call fix_realvec(temp_realvec_gpu, npwin, ndat*nspinor, fix_realvec_divide_by_2) 611 end if 612 613 call abi_gpu_xgemm(cplex, 'T', 'N', & 614 & nprojs, ndat*nspinor, npwin, & ! M,N,K 615 & cone, & ! alpha 616 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwin, & ! A, LDA 617 & temp_realvec_gpu, npwin, & ! B, LDB 618 & czero, & ! beta 619 & gemm_nonlop_gpu_data%projections_gpu, nprojs) ! C, LDC 620 621 !temp_realvec(1:npwin*nspinor*ndat) = vectin(2,1:npwin*nspinor*ndat) 622 call extract_imag_part(temp_realvec_gpu, vectin_ptr, npwin*nspinor*ndat) 623 624 if(istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then 625 ! do idat=1, ndat*nspinor 626 ! temp_realvec(1+(idat-1)*npwin) = zero 627 ! end do 628 call fix_realvec(temp_realvec_gpu, npwin, ndat*nspinor, fix_realvec_zero_out) 629 end if 630 call abi_gpu_xgemm(cplex, 'T', 'N', & 631 & nprojs, ndat*nspinor, npwin, & 632 & cone, & 633 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwin, & 634 & temp_realvec_gpu, npwin, & 635 & cone, & 636 & gemm_nonlop_gpu_data%projections_gpu, nprojs) 637 638 !gemm_nonlop_gpu_data%projections_gpu = 2 * gemm_nonlop_gpu_data%projections_gpu 639 call gpu_xscal(cplex, nprojs*nspinor*ndat, ctwo, gemm_nonlop_gpu_data%projections_gpu, 1) 640 641 end if ! cplex == 2 642 643 ! call xmpi_sum(projections,mpi_enreg%comm_fft,ierr) 644 645 if(cpopt >= 0) then 646 ! copy from GPU projections_gpu to HOST projections_cpu 647 call copy_from_gpu(projections_cpu, gemm_nonlop_gpu_data%projections_gpu,& 648 INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp) 649 650 if(.not. local_vectproj) then 651 !This use-case is extremely painful for GEMM GPU nonlop performance. 652 !vectproj paramter should always be provided to avoid it 653 654 ! store in cprjin 655 do idat=1, ndat*nspinor 656 shift = 0 657 do iatom = 1, natom 658 nlmn = cprjin(iatom, idat)%nlmn 659 cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections_cpu(1:cplex, shift+1:shift+nlmn, idat) 660 shift = shift + nlmn 661 end do 662 end do 663 end if 664 665 end if ! cpopt >= 0 666 667 end if ! cpopt >= 2 668 669 if(choice > 0) then 670 671 if(choice /= 7) then 672 ! opernlc 673 iatm = 0 674 ndgxdt = 0 675 ndgxdtfac = 0 676 nd2gxdt = 0 677 nd2gxdtfac = 0 678 optder = 0 679 680 ! get the maximun of nattyp array 681 nattyp_max = maxval(nattyp) 682 683 !! gpu alloc and init sij_typ_size_bytes 684 sij_typ_size_bytes = (((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2) * dp 685 ABI_MALLOC_CUDA( sij_typ_gpu, sij_typ_size_bytes ) 686 687 ABI_MALLOC ( sij_typ , (((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2) ) 688 689 shift = 0 690 do itypat=1, ntypat 691 nlmn=count(indlmn(3,:,itypat)>0) 692 if (paw_opt>=2) then 693 694 if (cplex_enl==1) then 695 696 do ilmn=1,nlmn*(nlmn+1)/2 697 sij_typ(ilmn) = sij(ilmn,itypat) 698 end do 699 700 else 701 702 do ilmn=1,nlmn*(nlmn+1)/2 703 sij_typ(ilmn) = sij(2*ilmn-1,itypat) 704 end do 705 706 end if 707 708 call copy_on_gpu(sij_typ, sij_typ_gpu, sij_typ_size_bytes ) 709 710 end if ! paw_opt>=2 711 712 ibeg = shift+1 713 iend = shift+nattyp(itypat)*nlmn 714 715 !Parallelization over spinors treatment 716 shift_spinor = 0 717 if (mpi_enreg%paral_spinor==1) then 718 shift_spinor = mpi_enreg%me_spinor 719 end if 720 721 ! Use the Kokkos implementation of opernlc if available 722 if (gpu_option == ABI_GPU_KOKKOS) then 723 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) 724 call opernlc_ylm_allwf_kokkos(cplex, cplex_enl, cplex_fac, & 725 & dimenl1, dimenl2, dimekbq, & 726 & iatm, itypat, ntypat, nprojs, & 727 & natom, nattyp(itypat), nspinor, & 728 & nspinortot, paw_opt, & 729 & nlmn, lmnmax, & 730 & enl_gpu, & 731 & gemm_nonlop_gpu_data%projections_gpu, & 732 & gemm_nonlop_gpu_data%vnl_projections_gpu, & 733 & vnl_projections2_gpu, & 734 & gemm_nonlop_gpu_data%s_projections_gpu, & 735 & shift_spinor, ndat, & 736 & atindx1_gpu, & 737 & indlmn_gpu, & 738 & lambda_gpu, & 739 & sij_typ_gpu, & 740 & shift, nattyp_max) 741 #endif 742 ! Fall back on CPU implementation of opernlc 743 else 744 745 call copy_from_gpu( projections_cpu, gemm_nonlop_gpu_data%projections_gpu, & 746 & INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp) 747 748 call opernlc_ylm_allwf_cpu(atindx1, cplex, cplex_enl, cplex_fac, & 749 & dimenl1, dimenl2, dimekbq, enl, & 750 & projections_cpu(:, ibeg:iend, 1:nspinor*ndat), & 751 & vnl_projections_cpu(:, ibeg:iend, 1:nspinor*ndat), & 752 & s_projections_cpu(:, ibeg:iend, 1:nspinor*ndat), & 753 & iatm, indlmn(:,:,itypat), itypat, ndat, lambda, mpi_enreg, natom, & 754 & nattyp(itypat), nlmn, nspinor, nspinortot, paw_opt, sij_typ) 755 756 call copy_on_gpu( s_projections_cpu, gemm_nonlop_gpu_data% s_projections_gpu, & 757 & INT(cplex, c_size_t) * nprojs * nspinor*ndat * dp) 758 call copy_on_gpu(vnl_projections_cpu, gemm_nonlop_gpu_data%vnl_projections_gpu, & 759 & INT(cplex_fac, c_size_t) * nprojs * nspinor*ndat * dp) 760 end if 761 762 shift = shift + nattyp(itypat)*nlmn 763 iatm = iatm + nattyp(itypat) 764 end do ! itypat 765 766 ABI_FREE(sij_typ) 767 ABI_FREE_CUDA( sij_typ_gpu ) 768 769 else ! choice == 7 770 771 ! TO BE REMOVED - DEBUG ONLY 772 call copy_gpu_to_gpu(gemm_nonlop_gpu_data%s_projections_gpu, & 773 & gemm_nonlop_gpu_data%projections_gpu, & 774 & INT(cplex, c_size_t) * nprojs * nspinor * ndat * dp) 775 776 end if ! choice 777 778 ! opernlb 779 if (paw_opt == 3 .or. paw_opt == 4) then 780 781 ! Get svectout from s_projections 782 if(cplex == 2) then 783 784 call abi_gpu_xgemm(cplex, 'N', 'N', & 785 & npwout, ndat*nspinor, nprojs, & ! M,N,K 786 & cone, & ! alpha 787 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwout, & ! A, LDA 788 & gemm_nonlop_gpu_data%s_projections_gpu, nprojs, & ! B, LDB 789 & czero, & ! beta 790 & svectout_ptr, npwout) ! C, LDC 791 792 else 793 794 if (.not. gpu_allocated(temp_realvec_gpu)) then 795 ABI_ERROR("Please provide memory allocation for temp_realvec_gpu array") 796 end if 797 798 call abi_gpu_xgemm(cplex, 'N', 'N', & 799 & npwout, ndat*nspinor, nprojs, & ! M,N,K 800 & cone, & ! alpha 801 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwout, & ! A, LDA 802 & gemm_nonlop_gpu_data%s_projections_gpu, nprojs, & ! B, LDB 803 & czero, & ! beta 804 & temp_realvec_gpu, npwout) ! C, LDC 805 !svectout(1,1:npwout*nspinor*ndat) = temp_realvec_gpu(1:npwout*nspinor*ndat) 806 call insert_real_part(svectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat) 807 808 call abi_gpu_xgemm(cplex, 'N', 'N', & 809 & npwout, ndat*nspinor, nprojs, & ! M,N,K 810 & cone, & ! alpha 811 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwout,& ! A, LDA 812 & gemm_nonlop_gpu_data%s_projections_gpu, nprojs, & ! B, LDB 813 & czero, & ! beta 814 & temp_realvec_gpu, npwout) ! C, LDC 815 !svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 816 call insert_imag_part(svectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat) 817 818 end if ! cplex == 2 819 820 if (choice /= 7) then 821 822 ! compute : svectout_gpu = svectout_gpu + vectin_gpu 823 ! this a axpy operation with x => vectin_gpu, y => svectout_gpu and alpha=1 824 ! please remember that svectout_gpu and vectin_gpu have same size when paw_opt >= 3 and paw_opt<6 825 ! this is the case here 826 call abi_gpu_xaxpy(1, & ! real 827 & 2*npwin*nspinor*ndat, & ! size 828 & cone, & ! alpha 829 & vectin_ptr, 1, & ! X, incrx 830 & svectout_ptr, 1) ! Y, incry 831 832 endif 833 834 if(gpu_option==ABI_GPU_LEGACY) then 835 ! copy back results on host 836 call copy_from_gpu(svectout, svectout_ptr, INT(2, c_size_t)*npwout*nspinor*(paw_opt/3)*ndat * dp) 837 end if 838 839 ! reminder : a call to cudaDeviceSynchronize is needed so that svectout can be re-used safely on host 840 ! don't do it here, only when really needed 841 842 end if ! (paw_opt == 3 .or. paw_opt == 4) 843 844 if (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then 845 846 ! Get vectout from vnl_projections 847 if (cplex_fac == 2) then 848 849 call abi_gpu_xgemm(cplex, 'N', 'N', & 850 & npwout, ndat*nspinor, nprojs, & ! M,N,K 851 & cone, & ! alpha 852 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs, npwout, & ! A, LDA 853 & gemm_nonlop_gpu_data%vnl_projections_gpu, nprojs, & ! B, LDB 854 & czero, & ! beta 855 & vectout_ptr, npwout) ! C, LDC 856 857 else 858 859 if (.not. gpu_allocated(temp_realvec_gpu)) then 860 ABI_ERROR("Please provide memory allocation for temp_realvec_gpu array") 861 end if 862 863 call abi_gpu_xgemm(cplex, 'N', 'N', & 864 & npwout, ndat*nspinor, nprojs, & ! M,N,K 865 & cone, & ! alpha 866 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_r, npwout, & ! A, LDA 867 & gemm_nonlop_gpu_data%vnl_projections_gpu, nprojs, & ! B, LDB 868 & czero, & ! beta 869 & temp_realvec_gpu, npwout) ! C, LDC 870 ! vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 871 call insert_real_part(vectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat) 872 873 call abi_gpu_xgemm(cplex, 'N', 'N', & 874 & npwout, ndat*nspinor, nprojs, & ! M,N,K 875 & cone, & ! alpha 876 & gemm_nonlop_kpt_gpu(gemm_nonlop_ikpt_this_proc_being_treated)%projs_i, npwout, & ! A, LDA 877 & gemm_nonlop_gpu_data%vnl_projections_gpu, nprojs, & ! B, LDB 878 & czero, & ! beta 879 & temp_realvec_gpu, npwout) ! C, LDC 880 ! vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 881 call insert_imag_part(vectout_ptr, temp_realvec_gpu, npwout*nspinor*ndat) 882 883 end if ! cplex_fac == 2 884 885 ! copy back results on host 886 if(gpu_option==ABI_GPU_LEGACY) then 887 call copy_from_gpu(vectout, vectout_ptr, INT(2, c_size_t)*npwout*nspinor*ndat * dp) 888 end if 889 890 end if ! (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) 891 892 end if ! choice > 0 893 894 if (dimekbq>1) then 895 ABI_FREE_CUDA(vnl_projections2_gpu) 896 end if 897 898 ! device sync before reusing data computed on device 899 ! this may not be the best location to have this sync 900 call gpu_device_synchronize() 901 902 if (gpu_allocated(temp_realvec_gpu)) then 903 ABI_FREE_CUDA(temp_realvec_gpu) 904 end if 905 906 ABI_FREE_CUDA( enl_gpu ) 907 ABI_FREE_CUDA( atindx1_gpu ) 908 ABI_FREE_CUDA( indlmn_gpu ) 909 ABI_FREE_CUDA( lambda_gpu ) 910 911 ! if projections_cpu was allocated, then free it here 912 if(.not. local_vectproj) then 913 ABI_FREE(projections_cpu) 914 end if 915 ABI_FREE( s_projections_cpu) ! TO BE REMOVED 916 ABI_FREE(vnl_projections_cpu) ! TO BE REMOVED 917 918 end subroutine gemm_nonlop_gpu
m_gemm_nonlop_gpu/init_gemm_nonlop_gpu [ Functions ]
[ Top ] [ m_gemm_nonlop_gpu ] [ Functions ]
NAME
init_gemm_nonlop_gpu
FUNCTION
Memory allocation of the gemm_nonlop_kpt_gpu array
INPUTS
nkpt= number of k-points
PARENTS
m_gstate
CHILDREN
abi_zgemm_2r,dgemm,opernlc_ylm,xmpi_sum
SOURCE
190 subroutine init_gemm_nonlop_gpu(nkpt) 191 192 integer,intent(in) :: nkpt 193 integer :: ikpt 194 195 ! ************************************************************************* 196 197 ! TODO only allocate the number of kpt treated by this proc 198 ABI_MALLOC(gemm_nonlop_kpt_gpu, (nkpt)) 199 do ikpt=1,nkpt 200 gemm_nonlop_kpt_gpu(ikpt)%npw = -1 201 gemm_nonlop_kpt_gpu(ikpt)%nprojs = -1 202 end do 203 204 gemm_nonlop_gpu_data % allocated = .false. 205 206 end subroutine init_gemm_nonlop_gpu
m_gemm_nonlop_gpu/make_gemm_nonlop_gpu [ Functions ]
[ Top ] [ m_gemm_nonlop_gpu ] [ Functions ]
NAME
make_gemm_nonlop_gpu
FUNCTION
Replacement of nonlop.
INPUTS
PARENTS
m_nonlop
CHILDREN
abi_zgemm_2r,dgemm,opernlc_ylm,xmpi_sum
SOURCE
280 subroutine make_gemm_nonlop_gpu(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 281 & ph3d_k,kpt_k,kg_k,kpg_k, & 282 & compute_grad_strain,compute_grad_atom) ! Optional parameters 283 284 !Arguments ------------------------------------ 285 integer, intent(in) :: ikpt 286 integer, intent(in) :: npw, lmnmax,ntypat 287 integer, intent(in) :: indlmn(:,:,:), kg_k(:,:) 288 integer, intent(in) :: nattyp(ntypat) 289 integer, intent(in) :: istwf_k 290 logical, intent(in), optional :: compute_grad_strain,compute_grad_atom 291 real(dp), intent(in) :: ucvol 292 real(dp), intent(in) :: ffnl_k(:,:,:,:) 293 real(dp), intent(in) :: ph3d_k(:,:,:) 294 real(dp), intent(in) :: kpt_k(:) 295 real(dp), intent(in), target :: kpg_k(:,:) 296 297 ! locals 298 integer :: nprojs, itypat 299 300 ! ************************************************************************* 301 302 ABI_UNUSED((/ikpt,npw,lmnmax,ntypat,indlmn,kg_k,nattyp,istwf_k/)) 303 ABI_UNUSED((/ucvol,ffnl_k,ph3d_k,kpt_k,kpg_k/)) 304 ABI_UNUSED((/compute_grad_strain,compute_grad_atom/)) 305 306 call make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 307 ph3d_k,kpt_k,kg_k,kpg_k, & 308 compute_grad_strain=compute_grad_strain,compute_grad_atom=compute_grad_atom) 309 310 nprojs = 0 311 do itypat=1,ntypat 312 nprojs = nprojs + count(indlmn(3,:,itypat)>0)*nattyp(itypat) 313 end do 314 gemm_nonlop_kpt_gpu(ikpt)%npw = npw 315 gemm_nonlop_kpt_gpu(ikpt)%nprojs = nprojs 316 317 #ifdef DEBUG_VERBOSE_GPU 318 if(xmpi_comm_rank(xmpi_world) == 0) then 319 call check_gpu_mem("make_gemm_nonlop begin") 320 call wrtout(std_out,sjoin(" npw .......", itoa(npw)), 'COLL') 321 call wrtout(std_out,sjoin(" nprojs .......", itoa(nprojs)), 'COLL') 322 end if 323 #endif 324 325 if(istwf_k <= 1) then 326 call alloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs, INT(2,c_size_t)*npw*nprojs*dp) 327 ! TODO : gradients 328 else 329 call alloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_r, INT(1, c_size_t)*npw*nprojs*dp) 330 call alloc_on_gpu(gemm_nonlop_kpt_gpu(ikpt)%projs_i, INT(1, c_size_t)*npw*nprojs*dp) 331 ! TODO : gradients 332 end if 333 334 #ifdef DEBUG_VERBOSE_GPU 335 if(xmpi_comm_rank(xmpi_world) == 0) then 336 call check_gpu_mem("make_gemm_nonlop end ") 337 end if 338 #endif 339 340 ! upload data to gpu memory 341 if(istwf_k <= 1) then 342 call copy_on_gpu(gemm_nonlop_kpt(ikpt)%projs, gemm_nonlop_kpt_gpu(ikpt)%projs, INT(2, c_size_t)*npw*nprojs*dp) 343 ! TODO : gradients 344 else 345 call copy_on_gpu(gemm_nonlop_kpt(ikpt)%projs_r, gemm_nonlop_kpt_gpu(ikpt)%projs_r, & 346 & INT(1, c_size_t)*npw*nprojs*dp) 347 call copy_on_gpu(gemm_nonlop_kpt(ikpt)%projs_i, gemm_nonlop_kpt_gpu(ikpt)%projs_i, & 348 & INT(1, c_size_t)*npw*nprojs*dp) 349 end if 350 351 end subroutine make_gemm_nonlop_gpu