TABLE OF CONTENTS

 ABINIT/from_block_cyclic_to_mat
 ABINIT/from_mat_to_block_cyclic
 ABINIT/pack_matrix
 ABINIT/rayleigh_ritz_distributed
 ABINIT/rayleigh_ritz_subdiago

ABINIT/from_block_cyclic_to_mat [ Functions ]

NAME

 from_block_cyclic_to_mat

FUNCTION

 Adds block_cyclic_mat into the columns of full_mat owned by processor iproc, following a 1D block-cyclic distribution over the columns (see the example under NOTES below)

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

 block_cyclic_mat(2,vectsize*buffsize)=local (block-cyclic) part of the matrix, added into full_mat
 vectsize=number of rows
 nband=number of columns of the full matrix
 buffsize=number of columns of the local part of the matrix
 blocksize=number of columns per block in the block-cyclic distribution
 iproc=index of the local processor (0-based)
 nprocs=total number of processors

OUTPUT

 full_mat(2,vectsize*nband)=full matrix; the columns owned by iproc are incremented by block_cyclic_mat

SIDE EFFECTS

PARENTS

      rayleigh_ritz

CHILDREN
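
NOTES

 The loop below implements a 1D block-cyclic distribution over the columns.
 As an illustration (this example is not part of ABINIT): with nband=10,
 blocksize=2 and nprocs=3, processor iproc=1 owns the column blocks 3-4 and
 9-10, so its local buffer holds buffsize=4 columns, and those columns of
 full_mat are incremented by the corresponding columns of block_cyclic_mat.
 A minimal standalone sketch of the ownership pattern:

   program demo_ownership
     implicit none
     integer, parameter :: nband = 10, blocksize = 2, nprocs = 3
     integer :: iproc, shift, cur_bsize
     do iproc = 0, nprocs-1
       shift = iproc*blocksize
       do
         cur_bsize = min(blocksize, nband-shift)
         if (cur_bsize <= 0) exit
         write(*,'(a,i0,a,i0,a,i0)') 'iproc ', iproc, ' owns columns ', &
&          shift+1, ' to ', shift+cur_bsize
         shift = shift + blocksize*nprocs
       end do
     end do
   end program demo_ownership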

SOURCE

667 subroutine from_block_cyclic_to_mat(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs)
668  use defs_basis
669  use defs_abitypes
670 
671 !This section has been created automatically by the script Abilint (TD).
672 !Do not modify the following lines by hand.
673 #undef ABI_FUNC
674 #define ABI_FUNC 'from_block_cyclic_to_mat'
675 !End of the abilint section
676 
677  implicit none
678 
679  integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs
680  real(dp), intent(inout) :: full_mat(2, vectsize*nband)
681  real(dp), intent(in) :: block_cyclic_mat(2, vectsize*buffsize)
682 
683  integer :: shift_fullmat, shift_block_cyclic_mat
684  integer :: cur_bsize
685 
686  ! *************************************************************************
687 
688  shift_fullmat = iproc*blocksize
689  shift_block_cyclic_mat = 0
690 
691  do while(.true.)
692    cur_bsize = MIN(blocksize, nband-shift_fullmat)
693    if(cur_bsize <= 0) exit
694    full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) =&
695 &   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) +&
696 &   block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize)
697 
698    shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize
699    shift_fullmat = shift_fullmat + blocksize*nprocs
700  end do
701 
702 end subroutine from_block_cyclic_to_mat

ABINIT/from_mat_to_block_cyclic [ Functions ]

NAME

 from_mat_to_block_cyclic

FUNCTION

 Fills block_cyclic_mat with the columns of full_mat owned by processor iproc, following a 1D block-cyclic distribution over the columns (see the usage sketch under NOTES below)

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

 full_mat(2,vectsize*nband)=full input matrix
 vectsize=number of rows
 nband=number of columns of the full matrix
 buffsize=number of columns of the local part of the matrix
 blocksize=number of columns per block in the block-cyclic distribution
 iproc=index of the local processor (0-based)
 nprocs=total number of processors

OUTPUT

 block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat

SIDE EFFECTS

PARENTS

      rayleigh_ritz

CHILDREN
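
NOTES

 A usage sketch (a hypothetical standalone driver, not part of ABINIT) showing
 a scatter/gather round trip with from_mat_to_block_cyclic and
 from_block_cyclic_to_mat. After the round trip, full_back holds the columns
 of full_mat owned by iproc and zeros elsewhere; summing full_back over all
 processors would reconstruct full_mat.

   program demo_round_trip
     implicit none
     integer, parameter :: dp = kind(1.0d0)
     integer, parameter :: vectsize = 4, nband = 10, blocksize = 2
     integer, parameter :: nprocs = 3, iproc = 1
     integer :: buffsize, shift
     real(dp) :: full_mat(2, vectsize*nband), full_back(2, vectsize*nband)
     real(dp), allocatable :: local_mat(:,:)

     ! Count the columns owned by iproc (what ScaLAPACK's NUMROC would return)
     buffsize = 0
     shift = iproc*blocksize
     do while (shift < nband)
       buffsize = buffsize + min(blocksize, nband-shift)
       shift = shift + blocksize*nprocs
     end do
     allocate(local_mat(2, vectsize*buffsize))

     call random_number(full_mat)
     call from_mat_to_block_cyclic(full_mat, vectsize, nband, local_mat, &
&      buffsize, blocksize, iproc, nprocs)
     full_back = 0.0_dp
     call from_block_cyclic_to_mat(full_back, vectsize, nband, local_mat, &
&      buffsize, blocksize, iproc, nprocs)
   end program demo_round_trip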

SOURCE

594 subroutine from_mat_to_block_cyclic(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs)
595  use defs_basis
596  use defs_abitypes
597 
598 !This section has been created automatically by the script Abilint (TD).
599 !Do not modify the following lines by hand.
600 #undef ABI_FUNC
601 #define ABI_FUNC 'from_mat_to_block_cyclic'
602 !End of the abilint section
603 
604  implicit none
605 
606  integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs
607  real(dp), intent(in) :: full_mat(2, vectsize*nband)
608  real(dp), intent(inout) :: block_cyclic_mat(2, vectsize*buffsize)
609 
610  integer :: shift_fullmat, shift_block_cyclic_mat
611  integer :: cur_bsize
612 
613  ! *************************************************************************
614 
615  shift_fullmat = iproc*blocksize
616  shift_block_cyclic_mat = 0
617 
618  do while(.true.)
619    cur_bsize = MIN(blocksize, nband-shift_fullmat)
620    if(cur_bsize <= 0) exit
621    block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize) = &
622 &   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize)
623 
624    shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize
625    shift_fullmat = shift_fullmat + blocksize*nprocs
626  end do
627 
628 end subroutine from_mat_to_block_cyclic

ABINIT/pack_matrix [ Functions ]

NAME

 pack_matrix

FUNCTION

 Packs the upper triangle of a (Hermitian) matrix into LAPACK packed storage (see NOTES)

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

 N=size of the matrix
 cplx=1 if the matrix is real, 2 if it is complex
 mat_in(cplx, N*N)=matrix to be packed

OUTPUT

 mat_out(cplx*N*(N+1)/2)=packed matrix (upper triangle, stored column by column)

SIDE EFFECTS

PARENTS

      rayleigh_ritz

CHILDREN

NOTES
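
 For illustration (not taken from the code): with N=3 and cplx=1 the packed
 vector stores the upper triangle column by column,

   mat_out = [ a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3) ]

 which is the 'U' packed layout expected by LAPACK routines such as
 DSPGV/ZHPGV (reached through abi_xhpgv in rayleigh_ritz_subdiago below).
 For cplx=2 the real and imaginary parts of each entry are stored
 consecutively, doubling the length of mat_out.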

SOURCE

740 subroutine pack_matrix(mat_in, mat_out, N, cplx)
741  use defs_basis
742 
743 !This section has been created automatically by the script Abilint (TD).
744 !Do not modify the following lines by hand.
745 #undef ABI_FUNC
746 #define ABI_FUNC 'pack_matrix'
747 !End of the abilint section
748 
749  implicit none
750 
757  integer, intent(in) :: N, cplx
758  real(dp), intent(in) :: mat_in(cplx, N*N)
759  real(dp), intent(out) :: mat_out(cplx*N*(N+1)/2)
760  integer :: isubh, i, j
761 
762  ! *************************************************************************
763 
764  isubh = 1
765  do j=1,N
766    do i=1,j
767      mat_out(isubh)    = mat_in(1, (j-1)*N+i)
768      ! bad for vectorization, but it's not performance critical, so ...
769      if(cplx == 2) then
770        mat_out(isubh+1)  = mat_in(2, (j-1)*N+i)
771      end if
772      isubh=isubh+cplx
773    end do
774  end do
775 end subroutine pack_matrix

ABINIT/rayleigh_ritz_distributed [ Functions ]

NAME

 rayleigh_ritz_distributed

FUNCTION

 Performs a Rayleigh-Ritz procedure (subspace rotation), building the distributed
 Hamiltonian/overlap matrices directly and calling the ScaLAPACK routines

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  usepaw=1 if the PAW method is used, 0 otherwise

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)

SIDE EFFECTS

  cg(2,*)=updated wavefunctions
  ghc(2,*)=updated H|C> (Hamiltonian applied to the wavefunctions)
  gsc(2,*)=updated S|C> (overlap operator applied to the wavefunctions; PAW only)
  gvnlc(2,*)=updated Vnl|C> (nonlocal potential applied to the wavefunctions; updated only without PAW)

PARENTS

      chebfi

CHILDREN

NOTES

  Should profile for large test cases and see where the bottleneck is.
  Is it the copies? Should we do partial GEMMs?
  Is it the latency? Should we buffer more?
  Should we overlap computations and communications? (easy in theory, tedious in practice)
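
  In matrix notation (a summary added for orientation, not taken from the code
  comments), with C the nband wavefunction columns stored in cg:

    H_sub = C^H (H C)            built block by block from cg and ghc
    S_sub = C^H (S C)            built from cg and gsc (or C^H C without PAW)
    H_sub X = S_sub X diag(eig)  solved with ScaLAPACK via compute_generalized_eigen_problem
    C <- C X,   (H C) <- (H C) X,   (S C) or (Vnl C) <- likewise

  Each processor assembles only its own block-cyclic block of H_sub and S_sub
  (summed onto iproc with xmpi_sum_master), and the rotation is accumulated
  block by block with from_block_cyclic_to_mat.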

SOURCE

297 subroutine rayleigh_ritz_distributed(cg,ghc,gsc,gvnlc,eig,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw)
298  use defs_basis
299  use defs_abitypes
300  use m_errors
301  use m_xmpi
302  use m_profiling_abi
303  use m_abi_linalg
304  use m_slk
305 
306 !This section has been created automatically by the script Abilint (TD).
307 !Do not modify the following lines by hand.
308 #undef ABI_FUNC
309 #define ABI_FUNC 'rayleigh_ritz_distributed'
310  use interfaces_18_timing
311  use interfaces_66_wfs, except_this_one => rayleigh_ritz_distributed
312 !End of the abilint section
313 
314  implicit none
315 
316  integer,external :: NUMROC
317 
318  ! Arguments
319  type(mpi_type),intent(inout) :: mpi_enreg
320  integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k
321  real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlc(2,npw*nspinor*nband)
322  real(dp),intent(out) :: eig(nband)
323 
324  ! Locals
325  integer :: blocksize,nbproc,iproc,ierr,cplx,vectsize
326  integer :: buffsize_iproc(2), coords_iproc(2), grid_dims(2)
327  real(dp) :: cg_new(2,npw*nspinor*nband),gsc_or_vnlc_new(2,npw*nspinor*nband),ghc_new(2,npw*nspinor*nband)
328  real(dp), allocatable :: ham_iproc(:,:), ovl_iproc(:,:), evec_iproc(:,:), left_temp(:,:), right_temp(:,:)
329  real(dp) :: tsec(2)
330  type(matrix_scalapack) :: sca_ham, sca_ovl, sca_evec
331  character(len=500) :: message
332  character :: blas_transpose
333 
334  integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
335  integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
336  integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610
337 
338  ! *************************************************************************
339 
340  if(istwf_k == 1) then
341    cplx = 2
342    vectsize = npw*nspinor
343    blas_transpose = 'c'
344  else
345    cplx = 1
346    vectsize = 2*npw*nspinor
347    blas_transpose = 't'
348  end if
349 
350  !write(message, *) 'RR: init'
351  !call wrtout(std_out,message,'COLL')
352  !======================================================================================================
353  ! Init Scalapack matrices
354  !======================================================================================================
355  call init_matrix_scalapack(sca_ham ,nband,nband,abi_processor,istwf_k,10)
356  call init_matrix_scalapack(sca_ovl ,nband,nband,abi_processor,istwf_k,10)
357  call init_matrix_scalapack(sca_evec,nband,nband,abi_processor,istwf_k,10)
358 
359  ! Get info
360  blocksize = sca_ham%sizeb_blocs(1) ! Assume square blocs
361  nbproc = abi_processor%grid%nbprocs
362  grid_dims = abi_processor%grid%dims
363 
364  !======================================================================================================
365  ! Build hamiltonian and overlap matrices
366  !======================================================================================================
367  ! TODO maybe we should avoid copies at the price of less BLAS efficiency (when blocksize is small)? must profile.
368 
369  call timab(timer_subham, 1, tsec)
370 
371  ! Transform cg, ghc and maybe gsc, according to istwf_k
372  if(istwf_k == 2) then
373    cg = cg * sqrt2
374    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2
375    ghc = ghc * sqrt2
376    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2
377    if(usepaw == 1) then
378      gsc = gsc * sqrt2
379      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2
380    end if
381  end if
382  
383  do iproc=0,nbproc-1
384    ! Build the local matrix belonging to processor iproc
385    !write(message, *) 'RR: build', iproc
386    !call wrtout(std_out,message,'COLL')
387 
388    ! Get coordinates of iproc
389    coords_iproc(1) = INT(iproc / grid_dims(2))
390    coords_iproc(2) = MOD(iproc,  grid_dims(2))
391 
392    ! Get buffersize of iproc
393    buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,abi_processor%grid%dims(1))
394    buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,abi_processor%grid%dims(2))
395 
396    ! Allocate the *_iproc matrices, which will gather the contribution of this proc to the block owned by iproc
397    ABI_ALLOCATE(ham_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
398    ABI_ALLOCATE(ovl_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
399 
400    ! Build them
401    ABI_ALLOCATE(left_temp,  (2, npw*nspinor*buffsize_iproc(1)))
402    ABI_ALLOCATE(right_temp, (2, npw*nspinor*buffsize_iproc(2)))
403 
404    ! ovl
405    call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, &
406 &   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
407    if(usepaw == 1) then
408      call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, right_temp, &
409 &     buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
410    else
411      call from_mat_to_block_cyclic(cg, npw*nspinor, nband, right_temp, &
412 &     buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
413    end if
414    call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,&
415 &   right_temp,vectsize,czero,ovl_iproc,buffsize_iproc(1), x_cplx=cplx)
416 
417    ! ham
418    call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, right_temp, &
419 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
420    call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,&
421 &   right_temp,vectsize,czero,ham_iproc,buffsize_iproc(1), x_cplx=cplx)
422    
423    ! Sum to iproc, and fill sca_ matrices
424    call xmpi_sum_master(ham_iproc, iproc, abi_communicator, ierr)
425    call xmpi_sum_master(ovl_iproc, iproc, abi_communicator, ierr)
426    if(iproc == abi_processor%myproc) then
427      ! DCOPY to bypass the real/complex issue
428      if(cplx == 2) then
429        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_cplx, 1)
430        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_cplx, 1)
431      else
432        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_real, 1)
433        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_real, 1)
434      end if
435    end if
436 
437    ABI_DEALLOCATE(ham_iproc)
438    ABI_DEALLOCATE(ovl_iproc)
439    ABI_DEALLOCATE(left_temp)
440    ABI_DEALLOCATE(right_temp)
441  end do
442 
443  ! Final sum
444  if(cplx == 2) then
445    call xmpi_sum(sca_ham%buffer_cplx, abi_complement_communicator, ierr)
446    call xmpi_sum(sca_ovl%buffer_cplx, abi_complement_communicator, ierr)
447  else
448    call xmpi_sum(sca_ham%buffer_real, abi_complement_communicator, ierr)
449    call xmpi_sum(sca_ovl%buffer_real, abi_complement_communicator, ierr)
450  end if
451  
452  ! Transform back
453  if(istwf_k == 2) then
454    cg = cg / sqrt2
455    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2
456    ghc = ghc / sqrt2
457    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2
458    if(usepaw == 1) then
459      gsc = gsc / sqrt2
460      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2
461    end if
462  end if
463  call timab(timer_subham, 2, tsec)
464  
465  !======================================================================================================
466  ! Do the diagonalization
467  !======================================================================================================
468  !write(message, *) 'RR: diag'
469  !call wrtout(std_out,message,'COLL')
470  call timab(timer_subdiago, 1, tsec)
471  call compute_generalized_eigen_problem(abi_processor,sca_ham,sca_ovl,&
472 & sca_evec,eig,abi_communicator,istwf_k)
473  call timab(timer_subdiago, 2, tsec)
474 
475  !======================================================================================================
476  ! Perform rotation
477  !======================================================================================================
478  call timab(timer_rotation, 1, tsec)
479  cg_new = zero
480  gsc_or_vnlc_new = zero
481  ghc_new = zero
482  do iproc=0,nbproc-1
483    ! Compute the contribution to the rotated matrices from this block
484    !write(message, *) 'RR: rot', iproc
485    !call wrtout(std_out,message,'COLL')
486 
487    ! Get coordinates of iproc
488    coords_iproc(1) = INT(iproc / grid_dims(2))
489    coords_iproc(2) = MOD(iproc,  grid_dims(2))
490 
491    ! Get buffersize of iproc
492    buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,abi_processor%grid%dims(1))
493    buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,abi_processor%grid%dims(2))
494 
495    ! Get data from iproc
496    ABI_ALLOCATE(evec_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
497    if(iproc == abi_processor%myproc) then
498      if(cplx == 2) then
499        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_cplx, 1, evec_iproc, 1)
500      else
501        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_real, 1, evec_iproc, 1)
502      end if
503    end if
504    call xmpi_bcast(evec_iproc,iproc,abi_communicator,ierr)
505 
506    ! Compute contribution to the rotated matrices from iproc
507    ABI_ALLOCATE(left_temp,  (2,npw*nspinor*buffsize_iproc(1)))
508    ABI_ALLOCATE(right_temp,  (2,npw*nspinor*buffsize_iproc(2)))
509 
510    ! cg
511    call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, &
512 &   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
513    call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
514 &   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx)
515    call from_block_cyclic_to_mat(cg_new, npw*nspinor, nband, right_temp, &
516 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
517 
518    ! ghc
519    call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, left_temp, &
520 &   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
521    call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
522 &   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize,x_cplx=cplx)
523    call from_block_cyclic_to_mat(ghc_new, npw*nspinor, nband, right_temp,&
524 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
525 
526    ! gsc or vnlc
527    if(usepaw == 1) then
528      call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, left_temp, &
529 &     buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
530    else
531      call from_mat_to_block_cyclic(gvnlc, npw*nspinor, nband, left_temp, &
532 &     buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
533    end if
534    call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
535 &   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx)
536    call from_block_cyclic_to_mat(gsc_or_vnlc_new, npw*nspinor, nband, right_temp, &
537 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
538 
539    ABI_DEALLOCATE(evec_iproc)
540    ABI_DEALLOCATE(left_temp)
541    ABI_DEALLOCATE(right_temp)
542  end do
543 
544  ! Overwrite with _new
545  cg = cg_new
546  ghc = ghc_new
547  if(usepaw == 1) then
548    gsc = gsc_or_vnlc_new
549  else
550    gvnlc = gsc_or_vnlc_new
551  end if
552  call timab(timer_rotation, 2, tsec)
553 
554 end subroutine rayleigh_ritz_distributed

ABINIT/rayleigh_ritz_subdiago [ Functions ]

NAME

 rayleigh_ritz_subdiago

FUNCTION

 Performs a Rayleigh-Ritz procedure (subspace rotation), building the
 Hamiltonian/overlap matrices in full and calling the subdiago method

COPYRIGHT

 Copyright (C) 2014-2018 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt .

INPUTS

  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  usepaw=1 if the PAW method is used, 0 otherwise

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)

SIDE EFFECTS

  cg(2,*)=updated wavefunctions
  ghc(2,*)=updated H|C> (Hamiltonian applied to the wavefunctions)
  gsc(2,*)=updated S|C> (overlap operator applied to the wavefunctions; PAW only)
  gvnlc(2,*)=updated Vnl|C> (nonlocal potential applied to the wavefunctions; updated only without PAW)

PARENTS

      chebfi

CHILDREN

NOTES

  TODO choose generalized eigenproblem or ortho + diago (see #if 1)
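
  A minimal serial sketch (plain BLAS/LAPACK, assuming the complex case cplx=2,
  no PAW and no MPI; it is not ABINIT code, and the stand-in ghc = cg is only
  there to make it runnable) of the sequence performed below with the abi_*
  wrappers: build the subspace matrices, pack them, solve the generalized
  eigenproblem and rotate.

    program rr_serial_sketch
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: vectsize = 20, nband = 4
      complex(dp) :: cg(vectsize,nband), ghc(vectsize,nband), gtempc(vectsize,nband)
      complex(dp) :: totham(nband,nband), totovl(nband,nband), evec(nband,nband)
      complex(dp) :: subham(nband*(nband+1)/2), subovl(nband*(nband+1)/2)
      complex(dp) :: work(2*nband-1)
      real(dp)    :: eig(nband), rwork(3*nband-2), harvest(vectsize,nband,2)
      integer     :: i, j, isubh, info

      call random_number(harvest)
      cg  = cmplx(harvest(:,:,1), harvest(:,:,2), kind=dp)
      ghc = cg    ! stand-in for H|cg>; a real run would apply the Hamiltonian

      ! Subspace matrices: totham = cg^H * ghc, totovl = cg^H * cg
      call zgemm('C','N',nband,nband,vectsize,(1.0_dp,0.0_dp),cg,vectsize, &
&       ghc,vectsize,(0.0_dp,0.0_dp),totham,nband)
      call zgemm('C','N',nband,nband,vectsize,(1.0_dp,0.0_dp),cg,vectsize, &
&       cg,vectsize,(0.0_dp,0.0_dp),totovl,nband)

      ! Pack the upper triangles column by column (same layout as pack_matrix)
      isubh = 0
      do j = 1, nband
        do i = 1, j
          isubh = isubh + 1
          subham(isubh) = totham(i,j)
          subovl(isubh) = totovl(i,j)
        end do
      end do

      ! Generalized eigenproblem  subham X = subovl X diag(eig)
      call zhpgv(1,'V','U',nband,subham,subovl,eig,evec,nband,work,rwork,info)

      ! Rotation: cg <- cg X (ghc, and gsc or gvnlc, are rotated the same way below)
      call zgemm('N','N',vectsize,nband,nband,(1.0_dp,0.0_dp),cg,vectsize, &
&       evec,nband,(0.0_dp,0.0_dp),gtempc,vectsize)
      cg = gtempc
    end program rr_serial_sketch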

SOURCE

 52 subroutine rayleigh_ritz_subdiago(cg,ghc,gsc,gvnlc,eig,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw)
 53  use defs_basis
 54  use defs_abitypes
 55  use m_cgtools
 56  use m_errors
 57  use m_xmpi
 58  use m_profiling_abi
 59  use m_abi_linalg
 60 
 61 !This section has been created automatically by the script Abilint (TD).
 62 !Do not modify the following lines by hand.
 63 #undef ABI_FUNC
 64 #define ABI_FUNC 'rayleigh_ritz_subdiago'
 65  use interfaces_14_hidewrite
 66  use interfaces_18_timing
 67  use interfaces_66_wfs, except_this_one => rayleigh_ritz_subdiago
 68 !End of the abilint section
 69 
 70  implicit none
 71 
 72  ! Arguments
 73  type(mpi_type),intent(inout) :: mpi_enreg
 74  integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k
 75  real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlc(2,npw*nspinor*nband)
 76  real(dp),intent(out) :: eig(nband)
 77 
 78  ! Locals
 79  real(dp), allocatable :: subham(:), totham(:,:)
 80  real(dp), allocatable :: subovl(:), totovl(:,:)
 81  real(dp), allocatable :: evec(:,:), edummy(:,:)
 82  integer :: ierr, cplx, vectsize
 83  real(dp) :: gtempc(2,npw*nspinor*nband), tsec(2)
 84  character :: blas_transpose
 85 
 86  integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
 87  integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
 88  integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610
 89 
 90  ! *************************************************************************
 91 
 92  if(istwf_k == 1) then
 93    cplx = 2
 94    vectsize = npw*nspinor
 95    blas_transpose = 'c'
 96  else
 97    cplx = 1
 98    vectsize = 2*npw*nspinor
 99    blas_transpose = 't'
100  end if
101 
102 #if 1
103  call timab(timer_subham, 1, tsec)
104 
105  ! Transform cg, ghc and maybe gsc, according to istwf_k
106  if(istwf_k == 2) then
107    cg = cg * sqrt2
108    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2
109    ghc = ghc * sqrt2
110    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2
111    if(usepaw == 1) then
112      gsc = gsc * sqrt2
113      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2
114    end if
115  end if
116 
117  ! Build, pack and sum suham
118  ABI_ALLOCATE(subham, (cplx*nband*(nband+1)/2))
119  ABI_ALLOCATE(totham, (cplx, nband*nband))
120  call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,ghc,vectsize,&
121 & cg,vectsize,czero,totham,nband, x_cplx=cplx)
122  call pack_matrix(totham, subham, nband, cplx)
123  ABI_DEALLOCATE(totham)
124  call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr)
125 
126 
127  ! Same for subovl
128  ABI_ALLOCATE(subovl, (cplx*nband*(nband+1)/2))
129  ABI_ALLOCATE(totovl, (cplx, nband*nband))
130  if(usepaw == 1) then
131    call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,gsc,vectsize,&
132 &   cg,vectsize,czero,totovl,nband, x_cplx=cplx)
133  else
134    call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,cg,vectsize,&
135 &   cg,vectsize,czero,totovl,nband, x_cplx=cplx)
136  end if
137  call pack_matrix(totovl, subovl, nband, cplx)
138  ABI_DEALLOCATE(totovl)
139  call xmpi_sum(subovl,mpi_enreg%comm_bandspinorfft,ierr)
140 
141 
142  ! Transform back
143  if(istwf_k == 2) then
144    cg = cg / sqrt2
145    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2
146    ghc = ghc / sqrt2
147    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2
148    if(usepaw == 1) then
149      gsc = gsc / sqrt2
150      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2
151    end if
152  end if
153  call timab(timer_subham, 2, tsec)
154 
155 
156  call timab(timer_subdiago, 1, tsec)
157  ABI_ALLOCATE(evec, (cplx*nband, nband))
158 
159  call abi_xhpgv(1,'V','U',nband,subham,subovl,eig,evec,istwf_k=istwf_k,use_slk=mpi_enreg%paral_kgb)
160 
161  ABI_DEALLOCATE(subham)
162  ABI_DEALLOCATE(subovl)
163 
164 ! Fix the phase: the matrix may be diagonalised simultaneously by different processors,
165 ! which can yield different unitary transforms and thus break the coherency of the parts
166 ! of cg stored on different processors.
167 ! call cg_normev(evec,nband,nband)  ! cg_normev requires normalized vectors, so fxphas_seq is used instead
168  ABI_ALLOCATE(edummy, (cplx*nband, nband))
169  call fxphas_seq(evec,edummy,0,0,1,nband*nband,nband*nband,nband,nband,0)
170  ABI_DEALLOCATE(edummy)
171 
172 
173 
174  ! Rotate
175  call abi_xgemm('n','n',vectsize,nband, nband,cone,cg , vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
176  cg = gtempc
177  call abi_xgemm('n','n',vectsize,nband, nband,cone,ghc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
178  ghc = gtempc
179  if(usepaw == 1) then
180    call abi_xgemm('n','n',vectsize,nband, nband,cone,gsc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
181    gsc = gtempc
182  else
183    call abi_xgemm('n','n',vectsize,nband, nband,cone,gvnlc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
184    gvnlc = gtempc
185  end if
186  ABI_DEALLOCATE(evec)
187  call timab(timer_subdiago, 2, tsec)
188 
189 #else
190  !! TODO non-functional, should be rewritten. Possibly faster (tests needed)
191  write(message, *) 'Transposed, orthogonalizing'
192  call wrtout(std_out,message,'COLL')
193 
194  ! orthonormalization
195  call timab(timer_ortho, 1, tsec)
196  if (paw) then
197    call abi_xorthonormalize(cg, gsc,nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2)
198  else
199    call abi_xorthonormalize(cg, cg, nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2)
200  end if
201  call timab(timer_ortho, 2, tsec)
202 
203  ! rotate ghc, gsc and gvnlc
204  call timab(timer_rotation, 1, tsec)
205  call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, ghc,npw*nspinor,x_cplx=2)
206  if(paw) then
207    call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gsc,npw*nspinor,x_cplx=2)
208  else
209    call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gvnlc,npw*nspinor,x_cplx=2)
210  end if
211  call timab(timer_rotation, 2, tsec)
212 
213  write(message, *) 'Orthogonalized, building subham'
214  call wrtout(std_out,message,'COLL')
215 
216  ! build hamiltonian  in subspace
217  call timab(timer_subham, 1, tsec)
218  call abi_xgemm(blas_transpose,'n',nband,nband,npw*nspinor,cone,ghc,npw*nspinor,&
219 & cg,npw*nspinor,czero,totham,nband, x_cplx=2)
220  ! pack for compatibility with subdiago
221  call pack_matrix(totham, subham, nband)
222  call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr)
223  call timab(timer_subham, 2, tsec)
224 
225  write(message, *) 'Subham built, diagonalizing'
226  call wrtout(std_out,message,'COLL')
227 
228  ! Rayleigh-Ritz
229  call timab(timer_subdiago,1,tsec)
230  call subdiago(cg,eig,evec,gsc,0,0,gs_hamk%istwf_k,&
231 & mcg,mcg,nband,npw,nspinor,dtset%paral_kgb,&
232 & subham,dummy,0,gs_hamk%usepaw,mpi_enreg%me_g0)
233  call timab(timer_subdiago,2,tsec)
234 
235  write(message, *) 'Diagonalization done'
236  call wrtout(std_out,message,'COLL')
237 
238  ! Rotate ghc and gvnlc according to evecs
239  call timab(timer_rotation, 1, tsec)
240  call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,ghc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2)
241  ghc = gtempc
242  if(.not. paw) then
243    call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,gvnlc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2)
244    gvnlc = gtempc
245  end if
246  call timab(timer_rotation, 2, tsec)
247 
248 #endif
249 end subroutine rayleigh_ritz_subdiago