ABINIT/from_block_cyclic_to_mat [ Functions ]


NAME

 from_block_cyclic_to_mat

FUNCTION

 Adds the local buffer block_cyclic_mat into the columns of full_mat owned by iproc, using a 1D block-cyclic distribution
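
 As an illustration, the column-ownership pattern implied by the shift logic in the
 source below can be enumerated with a minimal stand-alone sketch (not part of ABINIT;
 the parameter values are arbitrary):

   program demo_block_cyclic_ownership
     implicit none
     integer, parameter :: nband = 10, blocksize = 2, nprocs = 3
     integer :: iproc, shift, cur_bsize
     do iproc = 0, nprocs-1
       shift = iproc*blocksize                        ! first block owned by iproc
       do
         cur_bsize = min(blocksize, nband - shift)    ! last block may be partial
         if (cur_bsize <= 0) exit
         write(*,'(a,i0,a,i0,a,i0)') 'proc ', iproc, ' owns columns ', shift+1, ':', shift+cur_bsize
         shift = shift + blocksize*nprocs             ! jump to the next owned block
       end do
     end do
   end program demo_block_cyclic_ownership

 With these values, proc 0 owns columns 1:2 and 7:8, proc 1 owns 3:4 and 9:10, and
 proc 2 owns 5:6 (local widths buffsize = 4, 4 and 2).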

INPUTS

 block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat
 vectsize=number of rows
 nband=number of columns of the full matrix
 buffsize=number of columns of the local part of the matrix
 blocksize=size of block for the block-cyclic distribution
 iproc=local processor number
 nprocs=total number of processors

OUTPUT

 full_mat(2,vectsize*nband)=full matrix

SIDE EFFECTS

PARENTS

      rayleigh_ritz

CHILDREN

SOURCE

subroutine from_block_cyclic_to_mat(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'from_block_cyclic_to_mat'
!End of the abilint section

 implicit none

 integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs
 real(dp), intent(inout) :: full_mat(2, vectsize*nband)
 real(dp), intent(in) :: block_cyclic_mat(2, vectsize*buffsize)

 integer :: shift_fullmat, shift_block_cyclic_mat
 integer :: cur_bsize

 ! *************************************************************************

 ! The first column block owned by iproc comes after the iproc blocks of lower ranks
 shift_fullmat = iproc*blocksize
 shift_block_cyclic_mat = 0

 ! Walk through the blocks owned by iproc, accumulating each one into full_mat
 do while(.true.)
   cur_bsize = MIN(blocksize, nband-shift_fullmat)  ! last block may be partial
   if(cur_bsize <= 0) exit
   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) =&
&   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) +&
&   block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize)

   shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize
   shift_fullmat = shift_fullmat + blocksize*nprocs  ! jump to iproc's next block
 end do

end subroutine from_block_cyclic_to_mat

ABINIT/from_mat_to_block_cyclic [ Functions ]


NAME

 from_mat_to_block_cyclic

FUNCTION

 Fills block_cyclic_mat with the columns of full_mat owned by iproc, using a 1D block-cyclic distribution
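
 The buffsize argument must equal the number of columns that iproc owns; in the
 callers below this is obtained from ScaLAPACK's NUMROC. A minimal self-contained
 equivalent for source process 0, shown here only as a sketch of that bookkeeping
 (local_ncols is a hypothetical helper, not an ABINIT routine):

   integer function local_ncols(nband, blocksize, iproc, nprocs)
     ! Sketch of NUMROC(nband, blocksize, iproc, 0, nprocs): number of columns
     ! of an nband-column matrix owned by iproc in a 1D block-cyclic layout.
     implicit none
     integer, intent(in) :: nband, blocksize, iproc, nprocs
     integer :: nblocks, extra
     nblocks = nband / blocksize                  ! complete blocks in the matrix
     local_ncols = (nblocks/nprocs)*blocksize     ! evenly distributed part
     extra = mod(nblocks, nprocs)                 ! leftover complete blocks
     if (iproc < extra) then
       local_ncols = local_ncols + blocksize
     else if (iproc == extra) then
       local_ncols = local_ncols + mod(nband, blocksize)  ! trailing partial block
     end if
   end function local_ncols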

INPUTS

 full_mat(2,vectsize*nband)=full input matrix
 vectsize=number of rows
 nband=number of columns of the full matrix
 buffsize=number of columns of the local part of the matrix
 blocksize=size of block for the block-cyclic distribution
 iproc=local processor number
 nprocs=total number of processors

OUTPUT

 block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat

SIDE EFFECTS

PARENTS

      rayleigh_ritz

CHILDREN

SOURCE

subroutine from_mat_to_block_cyclic(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'from_mat_to_block_cyclic'
!End of the abilint section

 implicit none

 integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs
 real(dp), intent(in) :: full_mat(2, vectsize*nband)
 real(dp), intent(inout) :: block_cyclic_mat(2, vectsize*buffsize)

 integer :: shift_fullmat, shift_block_cyclic_mat
 integer :: cur_bsize

 ! *************************************************************************

 ! The first column block owned by iproc comes after the iproc blocks of lower ranks
 shift_fullmat = iproc*blocksize
 shift_block_cyclic_mat = 0

 ! Walk through the blocks owned by iproc, copying each one into the local buffer
 do while(.true.)
   cur_bsize = MIN(blocksize, nband-shift_fullmat)  ! last block may be partial
   if(cur_bsize <= 0) exit
   block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize) = &
&   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize)

   shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize
   shift_fullmat = shift_fullmat + blocksize*nprocs  ! jump to iproc's next block
 end do

end subroutine from_mat_to_block_cyclic
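
 Taken together, the two routines satisfy a simple round-trip identity: scattering a
 matrix to every processor with from_mat_to_block_cyclic and accumulating all the
 pieces back with from_block_cyclic_to_mat reconstructs it exactly, since each column
 is owned by exactly one processor. A minimal serial check, as a sketch only (it
 assumes the two routines are made accessible, e.g. by hosting the program in this
 module, and that dp matches ABINIT's double-precision kind):

   program check_roundtrip
     implicit none
     integer, parameter :: dp = kind(1.0d0)
     integer, parameter :: vectsize = 4, nband = 10, blocksize = 2, nprocs = 3
     integer, parameter :: buffsize(0:2) = (/ 4, 4, 2 /)  ! NUMROC(10,2,p,0,3)
     real(dp) :: a(2, vectsize*nband), b(2, vectsize*nband), piece(2, vectsize*4)
     integer :: p
     call random_number(a)
     b = 0.0_dp
     do p = 0, nprocs-1   ! play the role of each processor in turn
       call from_mat_to_block_cyclic(a, vectsize, nband, piece, buffsize(p), blocksize, p, nprocs)
       call from_block_cyclic_to_mat(b, vectsize, nband, piece, buffsize(p), blocksize, p, nprocs)
     end do
     write(*,*) 'max |a-b| = ', maxval(abs(a - b))  ! should be exactly zero
   end program check_roundtrip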

ABINIT/m_rayleigh_ritz [ Modules ]


NAME

  m_rayleigh_ritz

FUNCTION

 This module contains routines that perform the Rayleigh-Ritz procedure
 (subspace rotation), either by forming the full matrix (_subdiago) or by
 forming the distributed matrix in block-cyclic form (_distributed)
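
 Both variants implement the same mathematics. Writing the nband current
 wavefunctions as the columns of $X$, with $H$ the Hamiltonian and $S$ the overlap
 operator (the identity when usepaw=0), they form the subspace matrices, solve the
 generalized eigenproblem, and rotate:

   \bar{H} = X^\dagger H X, \qquad \bar{S} = X^\dagger S X, \qquad
   \bar{H}\,U = \bar{S}\,U\,\mathrm{diag}(\epsilon_1,\dots,\epsilon_{nband}), \qquad
   X \leftarrow X\,U .

 For istwf_k=2 the arrays are treated as real and the dot products are corrected by
 the sqrt2 scaling visible in the source below.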

COPYRIGHT

  Copyright (C) 2014-2018 ABINIT group (AL)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

module m_rayleigh_ritz

 use defs_basis
 use defs_abitypes
 use m_errors
 use m_cgtools
 use m_xmpi
 use m_abicore
 use m_abi_linalg
 use m_slk

 use m_time,          only : timab
 use m_numeric_tools, only : pack_matrix

 implicit none

 private

ABINIT/rayleigh_ritz_distributed [ Functions ]


NAME

 rayleigh_ritz_distributed

FUNCTION

 Performs a Rayleigh-Ritz procedure (subspace rotation), building the distributed
 Hamiltonian/overlap matrices directly and calling the ScaLAPACK routines
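
 Concretely, for each process p of the 2D grid, with row/column coordinates
 $(p_r, p_c)$, every process computes its plane-wave-local contribution to the block
 of the subspace matrices owned by p,

   \bar{H}^{(p)} = X_{(p_r)}^{\dagger}\,(H X)_{(p_c)}, \qquad
   \bar{S}^{(p)} = X_{(p_r)}^{\dagger}\,(S X)_{(p_c)},

 where $X_{(q)}$ denotes the columns assigned to grid coordinate $q$ by the 1D
 block-cyclic map of from_mat_to_block_cyclic, and the missing sum over plane waves
 is completed by the xmpi_sum_master / xmpi_sum reductions in the source below. This
 is exactly the loop over iproc in the source.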

INPUTS

  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  usepaw=1 if the PAW method is used, 0 otherwise

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)

SIDE EFFECTS

  cg(2,*)=updated wavefunctions
  ghc(2,*)=updated ghc
  gsc(2,*)=updated gsc
  gvnlc(2,*)=updated gvnlc

PARENTS

      chebfi

CHILDREN

NOTES

  Should profile for large test cases and see where the bottleneck is.
  Is it the copies? Should we do partial GEMMs?
  Is it the latency? Should we buffer more?
  Should we overlap computations and communications? (easy in theory, tedious in practice)

SOURCE

subroutine rayleigh_ritz_distributed(cg,ghc,gsc,gvnlc,eig,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'rayleigh_ritz_distributed'
!End of the abilint section

 implicit none

 integer,external :: NUMROC

 ! Arguments
 type(mpi_type),intent(inout) :: mpi_enreg
 integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k
 real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlc(2,npw*nspinor*nband)
 real(dp),intent(out) :: eig(nband)

 ! Locals
 integer :: blocksize,nbproc,iproc,ierr,cplx,vectsize
 integer :: buffsize_iproc(2), coords_iproc(2), grid_dims(2)
 real(dp) :: cg_new(2,npw*nspinor*nband),gsc_or_vnlc_new(2,npw*nspinor*nband),ghc_new(2,npw*nspinor*nband)
 real(dp), allocatable :: ham_iproc(:,:), ovl_iproc(:,:), evec_iproc(:,:), left_temp(:,:), right_temp(:,:)
 real(dp) :: tsec(2)
 type(matrix_scalapack) :: sca_ham, sca_ovl, sca_evec
 character(len=500) :: message
 character :: blas_transpose

 integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
 integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
 integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610

 ! *************************************************************************

 if(istwf_k == 1) then
   cplx = 2
   vectsize = npw*nspinor
   blas_transpose = 'c'
 else
   cplx = 1
   vectsize = 2*npw*nspinor
   blas_transpose = 't'
 end if

 !write(message, *) 'RR: init'
 !call wrtout(std_out,message,'COLL')
 !======================================================================================================
 ! Init Scalapack matrices
 !======================================================================================================
 call init_matrix_scalapack(sca_ham ,nband,nband,abi_processor,istwf_k,10)
 call init_matrix_scalapack(sca_ovl ,nband,nband,abi_processor,istwf_k,10)
 call init_matrix_scalapack(sca_evec,nband,nband,abi_processor,istwf_k,10)

 ! Get info
 blocksize = sca_ham%sizeb_blocs(1) ! Assume square blocs
 nbproc = abi_processor%grid%nbprocs
 grid_dims = abi_processor%grid%dims

 !======================================================================================================
 ! Build hamiltonian and overlap matrices
 !======================================================================================================
 ! TODO maybe we should avoid copies at the price of less BLAS efficiency (when blocksize is small)? must profile.

 call timab(timer_subham, 1, tsec)

 ! Transform cg, ghc and maybe gsc, according to istwf_k: for istwf_k=2 only half of the
 ! G-sphere is stored, so the columns are scaled by sqrt2 (except the G=0 coefficient,
 ! held by the me_g0 processor) to make the half-sphere dot products below equal the
 ! full-sphere ones.
 if(istwf_k == 2) then
   cg = cg * sqrt2
   if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2
   ghc = ghc * sqrt2
   if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2
   if(usepaw == 1) then
     gsc = gsc * sqrt2
     if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2
   end if
 end if

 do iproc=0,nbproc-1
   ! Build the local matrix belonging to processor iproc
   !write(message, *) 'RR: build', iproc
   !call wrtout(std_out,message,'COLL')

   ! Get coordinates of iproc
   coords_iproc(1) = INT(iproc / grid_dims(2))
   coords_iproc(2) = MOD(iproc,  grid_dims(2))

   ! Get buffersize of iproc
   buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,abi_processor%grid%dims(1))
   buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,abi_processor%grid%dims(2))

   ! Allocate matrices_iproc, that will gather the contribution of this proc to the block owned by iproc
   ABI_ALLOCATE(ham_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
   ABI_ALLOCATE(ovl_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))

   ! Build them
   ABI_ALLOCATE(left_temp,  (2, npw*nspinor*buffsize_iproc(1)))
   ABI_ALLOCATE(right_temp, (2, npw*nspinor*buffsize_iproc(2)))

   ! ovl
   call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, &
&   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
   if(usepaw == 1) then
     call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, right_temp, &
&     buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
   else
     call from_mat_to_block_cyclic(cg, npw*nspinor, nband, right_temp, &
&     buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
   end if
   call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,&
&   right_temp,vectsize,czero,ovl_iproc,buffsize_iproc(1), x_cplx=cplx)

   ! ham
   call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, right_temp, &
&   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
   call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,&
&   right_temp,vectsize,czero,ham_iproc,buffsize_iproc(1), x_cplx=cplx)

   ! Sum to iproc, and fill sca_ matrices
   call xmpi_sum_master(ham_iproc, iproc, abi_communicator, ierr)
   call xmpi_sum_master(ovl_iproc, iproc, abi_communicator, ierr)
   if(iproc == abi_processor%myproc) then
     ! DCOPY to bypass the real/complex issue
     if(cplx == 2) then
       call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_cplx, 1)
       call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_cplx, 1)
     else
       call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_real, 1)
       call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_real, 1)
     end if
   end if

   ABI_DEALLOCATE(ham_iproc)
   ABI_DEALLOCATE(ovl_iproc)
   ABI_DEALLOCATE(left_temp)
   ABI_DEALLOCATE(right_temp)
 end do

 ! Final sum
 if(cplx == 2) then
   call xmpi_sum(sca_ham%buffer_cplx, abi_complement_communicator, ierr)
   call xmpi_sum(sca_ovl%buffer_cplx, abi_complement_communicator, ierr)
 else
   call xmpi_sum(sca_ham%buffer_real, abi_complement_communicator, ierr)
   call xmpi_sum(sca_ovl%buffer_real, abi_complement_communicator, ierr)
 end if

 ! Transform back
 if(istwf_k == 2) then
   cg = cg / sqrt2
   if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2
   ghc = ghc / sqrt2
   if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2
   if(usepaw == 1) then
     gsc = gsc / sqrt2
     if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2
   end if
 end if
 call timab(timer_subham, 2, tsec)

 !======================================================================================================
 ! Do the diagonalization
 !======================================================================================================
 !write(message, *) 'RR: diag'
 !call wrtout(std_out,message,'COLL')
 call timab(timer_subdiago, 1, tsec)
 call compute_generalized_eigen_problem(abi_processor,sca_ham,sca_ovl,&
& sca_evec,eig,abi_communicator,istwf_k)
 call timab(timer_subdiago, 2, tsec)

 !======================================================================================================
 ! Perform rotation
 !======================================================================================================
 call timab(timer_rotation, 1, tsec)
 cg_new = zero
 gsc_or_vnlc_new = zero
 ghc_new = zero
 do iproc=0,nbproc-1
   ! Compute the contribution to the rotated matrices from this block
   !write(message, *) 'RR: rot', iproc
   !call wrtout(std_out,message,'COLL')

   ! Get coordinates of iproc
   coords_iproc(1) = INT(iproc / grid_dims(2))
   coords_iproc(2) = MOD(iproc,  grid_dims(2))

   ! Get buffersize of iproc
   buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,abi_processor%grid%dims(1))
   buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,abi_processor%grid%dims(2))

   ! Get data from iproc
   ABI_ALLOCATE(evec_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
   if(iproc == abi_processor%myproc) then
     if(cplx == 2) then
       call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_cplx, 1, evec_iproc, 1)
     else
       call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_real, 1, evec_iproc, 1)
     end if
   end if
   call xmpi_bcast(evec_iproc,iproc,abi_communicator,ierr)

   ! Compute contribution to the rotated matrices from iproc
   ABI_ALLOCATE(left_temp,  (2,npw*nspinor*buffsize_iproc(1)))
   ABI_ALLOCATE(right_temp,  (2,npw*nspinor*buffsize_iproc(2)))

   ! cg
   call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, &
&   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
   call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
&   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx)
   call from_block_cyclic_to_mat(cg_new, npw*nspinor, nband, right_temp, &
&   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))

   ! ghc
   call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, left_temp, &
&   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
   call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
&   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize,x_cplx=cplx)
   call from_block_cyclic_to_mat(ghc_new, npw*nspinor, nband, right_temp,&
&   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))

   ! gsc or vnlc
   if(usepaw == 1) then
     call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, left_temp, &
&     buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
   else
     call from_mat_to_block_cyclic(gvnlc, npw*nspinor, nband, left_temp, &
&     buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
   end if
   call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
&   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx)
   call from_block_cyclic_to_mat(gsc_or_vnlc_new, npw*nspinor, nband, right_temp, &
&   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))

   ABI_DEALLOCATE(evec_iproc)
   ABI_DEALLOCATE(left_temp)
   ABI_DEALLOCATE(right_temp)
 end do

 ! Overwrite with _new
 cg = cg_new
 ghc = ghc_new
 if(usepaw == 1) then
   gsc = gsc_or_vnlc_new
 else
   gvnlc = gsc_or_vnlc_new
 end if
 call timab(timer_rotation, 2, tsec)

end subroutine rayleigh_ritz_distributed

ABINIT/rayleigh_ritz_subdiago [ Functions ]


NAME

 rayleigh_ritz_subdiago

FUNCTION

 Performs a Rayleigh-Ritz procedure (subspace rotation), building the full
 Hamiltonian/overlap matrices and calling the subdiago method
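
 The subspace matrices are stored in packed form before diagonalization: pack_matrix
 keeps only the upper triangle, column by column, so that element $(i,j)$ with
 $i \le j$ lands at position $i + j(j-1)/2$. A minimal real-valued sketch of this
 convention (an assumption about pack_matrix's layout, consistent with the 'U'
 argument passed to abi_xhpgv below; pack_upper is a hypothetical name):

   subroutine pack_upper(full, packed, n)
     ! Pack the upper triangle of a real n x n matrix, column by column
     ! (the LAPACK 'U' packed convention).
     implicit none
     integer, intent(in) :: n
     real(kind=8), intent(in)  :: full(n, n)
     real(kind=8), intent(out) :: packed(n*(n+1)/2)
     integer :: i, j
     do j = 1, n
       do i = 1, j
         packed(i + (j-1)*j/2) = full(i, j)
       end do
     end do
   end subroutine pack_upper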

INPUTS

  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  usepaw=1 if the PAW method is used, 0 otherwise

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)

SIDE EFFECTS

  cg(2,*)=updated wavefunctions
  ghc(2,*)=updated ghc
  gsc(2,*)=updated gsc
  gvnlc(2,*)=updated gvnlc

PARENTS

      chebfi

CHILDREN

NOTES

  TODO choose generalized eigenproblem or ortho + diago (see #if 1)

SOURCE

subroutine rayleigh_ritz_subdiago(cg,ghc,gsc,gvnlc,eig,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'rayleigh_ritz_subdiago'
!End of the abilint section

 implicit none

 ! Arguments
 type(mpi_type),intent(inout) :: mpi_enreg
 integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k
 real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlc(2,npw*nspinor*nband)
 real(dp),intent(out) :: eig(nband)

 ! Locals
 real(dp), allocatable :: subham(:), totham(:,:)
 real(dp), allocatable :: subovl(:), totovl(:,:)
 real(dp), allocatable :: evec(:,:), edummy(:,:)
 integer :: ierr, cplx, vectsize
 real(dp) :: gtempc(2,npw*nspinor*nband), tsec(2)
 character :: blas_transpose

 integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
 integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
 integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610

 ! *************************************************************************

 if(istwf_k == 1) then
   cplx = 2
   vectsize = npw*nspinor
   blas_transpose = 'c'
 else
   cplx = 1
   vectsize = 2*npw*nspinor
   blas_transpose = 't'
 end if

#if 1
 call timab(timer_subham, 1, tsec)

 ! Transform cg, ghc and maybe gsc, according to istwf_k: for istwf_k=2 only half of the
 ! G-sphere is stored, so the columns are scaled by sqrt2 (except the G=0 coefficient,
 ! held by the me_g0 processor) to make the half-sphere dot products below equal the
 ! full-sphere ones.
 if(istwf_k == 2) then
   cg = cg * sqrt2
   if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2
   ghc = ghc * sqrt2
   if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2
   if(usepaw == 1) then
     gsc = gsc * sqrt2
     if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2
   end if
 end if

 ! Build, pack and sum subham
 ABI_ALLOCATE(subham, (cplx*nband*(nband+1)/2))
 ABI_ALLOCATE(totham, (cplx, nband*nband))
 call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,ghc,vectsize,&
& cg,vectsize,czero,totham,nband, x_cplx=cplx)
 call pack_matrix(totham, subham, nband, cplx)
 ABI_DEALLOCATE(totham)
 call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr)


 ! Same for subovl
 ABI_ALLOCATE(subovl, (cplx*nband*(nband+1)/2))
 ABI_ALLOCATE(totovl, (cplx, nband*nband))
 if(usepaw == 1) then
   call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,gsc,vectsize,&
&   cg,vectsize,czero,totovl,nband, x_cplx=cplx)
 else
   call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,cg,vectsize,&
&   cg,vectsize,czero,totovl,nband, x_cplx=cplx)
 end if
 call pack_matrix(totovl, subovl, nband, cplx)
 ABI_DEALLOCATE(totovl)
 call xmpi_sum(subovl,mpi_enreg%comm_bandspinorfft,ierr)


 ! Transform back
 if(istwf_k == 2) then
   cg = cg / sqrt2
   if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2
   ghc = ghc / sqrt2
   if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2
   if(usepaw == 1) then
     gsc = gsc / sqrt2
     if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2
   end if
 end if
 call timab(timer_subham, 2, tsec)


 call timab(timer_subdiago, 1, tsec)
 ABI_ALLOCATE(evec, (cplx*nband, nband))

 call abi_xhpgv(1,'V','U',nband,subham,subovl,eig,evec,istwf_k=istwf_k,use_slk=mpi_enreg%paral_kgb)

 ABI_DEALLOCATE(subham)
 ABI_DEALLOCATE(subovl)

! Fix the phase: the matrix is diagonalised independently by the different processors,
! which may pick different unitary transforms, breaking the coherency of the parts of
! cg stored on different processors.
! call cg_normev(evec,nband,nband)  ! cg_normev requires normalized vectors, so fxphas_seq is used instead
 ABI_ALLOCATE(edummy, (cplx*nband, nband))
 call fxphas_seq(evec,edummy,0,0,1,nband*nband,nband*nband,nband,nband,0)
 ABI_DEALLOCATE(edummy)



 ! Rotate
 call abi_xgemm('n','n',vectsize,nband, nband,cone,cg , vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
 cg = gtempc
 call abi_xgemm('n','n',vectsize,nband, nband,cone,ghc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
 ghc = gtempc
 if(usepaw == 1) then
   call abi_xgemm('n','n',vectsize,nband, nband,cone,gsc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
   gsc = gtempc
 else
   call abi_xgemm('n','n',vectsize,nband, nband,cone,gvnlc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
   gvnlc = gtempc
 end if
 ABI_DEALLOCATE(evec)
 call timab(timer_subdiago, 2, tsec)

#else
 !! TODO non-functional, should be rewritten. Possibly faster (tests needed)
 write(message, *) 'Transposed, orthogonalizing'
 call wrtout(std_out,message,'COLL')

 ! orthonormalization
 call timab(timer_ortho, 1, tsec)
 if (paw) then
   call abi_xorthonormalize(cg, gsc,nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2)
 else
   call abi_xorthonormalize(cg, cg, nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2)
 end if
 call timab(timer_ortho, 2, tsec)

 ! rotate ghc, gsc and gvnlc
 call timab(timer_rotation, 1, tsec)
 call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, ghc,npw*nspinor,x_cplx=2)
 if(paw) then
   call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gsc,npw*nspinor,x_cplx=2)
 else
   call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gvnlc,npw*nspinor,x_cplx=2)
 end if
 call timab(timer_rotation, 2, tsec)

 write(message, *) 'Orthogonalized, building subham'
 call wrtout(std_out,message,'COLL')

 ! build hamiltonian in subspace
 call timab(timer_subham, 1, tsec)
 call abi_xgemm(blas_transpose,'n',nband,nband,npw*nspinor,cone,ghc,npw*nspinor,&
& cg,npw*nspinor,czero,totham,nband, x_cplx=2)
 ! pack for compatibility with subdiago
 call pack_matrix(totham, subham, nband)
 call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr)
 call timab(timer_subham, 2, tsec)

 write(message, *) 'Subham built, diagonalizing'
 call wrtout(std_out,message,'COLL')

 ! Rayleigh-Ritz
 call timab(timer_subdiago,1,tsec)
 call subdiago(cg,eig,evec,gsc,0,0,gs_hamk%istwf_k,&
& mcg,mcg,nband,npw,nspinor,dtset%paral_kgb,&
& subham,dummy,0,gs_hamk%usepaw,mpi_enreg%me_g0)
 call timab(timer_subdiago,2,tsec)

 write(message, *) 'Diagonalization done'
 call wrtout(std_out,message,'COLL')

 ! Rotate ghc and gvnlc according to evecs
 call timab(timer_rotation, 1, tsec)
 call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,ghc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2)
 ghc = gtempc
 if(.not. paw) then
   call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,gvnlc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2)
   gvnlc = gtempc
 end if
 call timab(timer_rotation, 2, tsec)

#endif

end subroutine rayleigh_ritz_subdiago