ABINIT/from_block_cyclic_to_mat [ Functions ]

NAME

 from_block_cyclic_to_mat

FUNCTION

 Adds block_cyclic_mat into the columns of full_mat owned by iproc, using a 1D block-cyclic distribution
 (the local contribution is accumulated into full_mat; a standalone sketch of the column mapping follows the source listing)

INPUTS

 block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat
 vectsize=number of rows
 nband=number of columns of the full matrix
 buffsize=number of columns of the local part of the matrix
 blocksize=number of columns per block in the block-cyclic distribution
 iproc=rank of the local processor (0-based)
 nprocs=total number of processors

OUTPUT

 full_mat(2,vectsize*nband)=full matrix, updated in place (the local contribution is added to it)

SIDE EFFECTS

SOURCE

616 subroutine from_block_cyclic_to_mat(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs)
617 
618  integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs
619  real(dp), intent(inout) :: full_mat(2, vectsize*nband)
620  real(dp), intent(in) :: block_cyclic_mat(2, vectsize*buffsize)
621 
622  integer :: shift_fullmat, shift_block_cyclic_mat
623  integer :: cur_bsize
624 
625  ! *************************************************************************
626 
627  shift_fullmat = iproc*blocksize
628  shift_block_cyclic_mat = 0
629 
630  do while(.true.)
631    cur_bsize = MIN(blocksize, nband-shift_fullmat)
632    if(cur_bsize <= 0) exit
633    full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) =&
634 &   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) +&
635 &   block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize)
636 
637    shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize
638    shift_fullmat = shift_fullmat + blocksize*nprocs
639  end do
640 
641 end subroutine from_block_cyclic_to_mat
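
 The column mapping can be checked with the small standalone program below. It is not part of
 ABINIT: the values nband=10, blocksize=2, nprocs=3 and iproc=1 are only an illustration. It
 reproduces the traversal of the do-while loop above and prints the global columns owned by
 processor 1, namely columns 3-4 and 9-10.

  program block_cyclic_ownership
    implicit none
    integer, parameter :: nband = 10, blocksize = 2, nprocs = 3, iproc = 1
    integer :: shift, icol

    ! Same traversal as from_block_cyclic_to_mat: start at the first block
    ! owned by iproc, then jump by blocksize*nprocs from block to block.
    shift = iproc*blocksize
    do while (shift < nband)
      do icol = shift+1, min(shift+blocksize, nband)
        write(*,'(a,i0,a,i0)') 'proc ', iproc, ' owns global column ', icol
      end do
      shift = shift + blocksize*nprocs
    end do
  end program block_cyclic_ownership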

ABINIT/from_mat_to_block_cyclic [ Functions ]

NAME

 from_mat_to_block_cyclic

FUNCTION

 Fills block_cyclic_mat with the columns of full_mat owned by iproc, using a 1D block-cyclic distribution
 (a standalone sketch of the local-to-global column mapping follows the source listing)

INPUTS

 full_mat(2,vectsize*nband)=full input matrix
 vectsize=number of rows
 nband=number of columns of the full matrix
 buffsize=number of columns of the local part of the matrix
 blocksize=number of columns per block in the block-cyclic distribution
 iproc=rank of the local processor (0-based)
 nprocs=total number of processors

OUTPUT

 block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat

SIDE EFFECTS

SOURCE

566 subroutine from_mat_to_block_cyclic(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs)
567 
568  integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs
569  real(dp), intent(in) :: full_mat(2, vectsize*nband)
570  real(dp), intent(inout) :: block_cyclic_mat(2, vectsize*buffsize)
571 
572  integer :: shift_fullmat, shift_block_cyclic_mat
573  integer :: cur_bsize
574 
575  ! *************************************************************************
576 
577  shift_fullmat = iproc*blocksize
578  shift_block_cyclic_mat = 0
579 
580  do while(.true.)
581    cur_bsize = MIN(blocksize, nband-shift_fullmat)
582    if(cur_bsize <= 0) exit
583    block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize) = &
584 &   full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize)
585 
586    shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize
587    shift_fullmat = shift_fullmat + blocksize*nprocs
588  end do
589 
590 end subroutine from_mat_to_block_cyclic
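
 The inverse view of the same distribution, i.e. which global column of full_mat a given local
 column of block_cyclic_mat corresponds to, admits a closed form. The standalone program below
 (illustrative only, not ABINIT code, with the same example values as above) prints this
 local-to-global mapping for processor 1 and recovers global columns 3, 4, 9 and 10.

  program block_cyclic_loc2glob
    implicit none
    integer, parameter :: blocksize = 2, nprocs = 3, iproc = 1, buffsize = 4
    integer :: k

    do k = 1, buffsize
      write(*,'(a,i0,a,i0)') 'local column ', k, ' -> global column ', loc2glob(k)
    end do

  contains

    ! Global column index of local column k under the 1D block-cyclic
    ! distribution used by from_mat_to_block_cyclic/from_block_cyclic_to_mat.
    integer function loc2glob(k)
      integer, intent(in) :: k
      loc2glob = ((k-1)/blocksize)*blocksize*nprocs + iproc*blocksize + mod(k-1,blocksize) + 1
    end function loc2glob

  end program block_cyclic_loc2glob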

ABINIT/m_rayleigh_ritz [ Modules ]

NAME

  m_rayleigh_ritz

FUNCTION

 This file contains routines that perform the Rayleigh-Ritz procedure
 (subspace rotation), either by forming the full matrix (_subdiago) or by
 forming the distributed matrix in block-cyclic form (_distributed).

COPYRIGHT

  Copyright (C) 2014-2024 ABINIT group (AL)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

18 #if defined HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include "abi_common.h"
23 
24 module m_rayleigh_ritz
25 
26  use defs_basis
27  use m_errors
28  use m_cgtools
29  use m_xmpi
30  use m_abicore
31  use m_abi_linalg
32  use m_slk
33 
34  use defs_abitypes,   only : mpi_type
35  use m_time,          only : timab
36  use m_numeric_tools, only : pack_matrix
37 
38  implicit none
39 
40  private

ABINIT/rayleigh_ritz_distributed [ Functions ]

NAME

 rayleigh_ritz_distributed

FUNCTION

 Performs a Rayleigh-Ritz procedure (subspace rotation), building the distributed
 Hamiltonian/overlap matrices directly and calling the ScaLAPACK routines

INPUTS

  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  usepaw=1 if the PAW method is used
  istwf_k=option parameter describing the storage of the wavefunctions at this k point
  has_fock=true if the Hamiltonian includes a Fock-exchange contribution

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)

SIDE EFFECTS

  cg(2,*)=updated wavefunctions
  ghc(2,*)=updated ghc
  gsc(2,*)=updated gsc
  gvnlxc(2,*)=updated gvnlxc

NOTES

  Should profile for large test cases and see where the bottleneck is.
  Is it the copies? Should we do partial GEMMs?
  Is it the latency? Should we buffer more?
  Should we overlap computations and communications? (easy in theory, tedious in practice)

SOURCE

294 subroutine rayleigh_ritz_distributed(cg,ghc,gsc,gvnlxc,eig,has_fock,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw)
295 
296  integer,external :: NUMROC
297 
298  ! Arguments
299  type(mpi_type),intent(inout) :: mpi_enreg
300  integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k
301  real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlxc(2,npw*nspinor*nband)
302  real(dp),intent(out) :: eig(nband)
303  logical :: has_fock
304 
305  ! Locals
306  integer :: blocksize,nbproc,iproc,ierr,cplx,vectsize
307  integer :: buffsize_iproc(2), coords_iproc(2), grid_dims(2)
308  real(dp) :: cg_new(2,npw*nspinor*nband),gsc_or_vnlxc_new(2,npw*nspinor*nband),ghc_new(2,npw*nspinor*nband)
309  real(dp), allocatable :: ham_iproc(:,:), ovl_iproc(:,:), evec_iproc(:,:), left_temp(:,:), right_temp(:,:)
310  real(dp) :: tsec(2)
311  type(matrix_scalapack) :: sca_ham, sca_ovl, sca_evec
312  !character(len=500) :: message
313  character :: blas_transpose
314 
315  integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
316  integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
317  integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610
318 
319  ! *************************************************************************
320 
321  if(istwf_k == 1) then
322    cplx = 2
323    vectsize = npw*nspinor
324    blas_transpose = 'c'
325  else
326    cplx = 1
327    vectsize = 2*npw*nspinor
328    blas_transpose = 't'
329  end if
330 
331  !write(message, *) 'RR: init'
332  !call wrtout(std_out,message)
333  !======================================================================================================
334  ! Init Scalapack matrices
335  !======================================================================================================
336  call sca_ham%init(nband,nband,slk_processor,istwf_k)
337  call sca_ovl%init(nband,nband,slk_processor,istwf_k)
338  call sca_evec%init(nband,nband,slk_processor,istwf_k)
339 
340  ! Get info
341  blocksize = sca_ham%sizeb_blocs(1) ! Assume square blocks
342  nbproc = slk_processor%grid%nbprocs
343  grid_dims = slk_processor%grid%dims
344 
345  !======================================================================================================
346  ! Build hamiltonian and overlap matrices
347  !======================================================================================================
348  ! TODO maybe we should avoid copies at the price of less BLAS efficiency (when blocksize is small)? must profile.
349 
350  call timab(timer_subham, 1, tsec)
351 
352  ! Transform cg, ghc and maybe gsc, according to istwf_k
353  if(istwf_k == 2) then
354    cg = cg * sqrt2
355    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2
356    ghc = ghc * sqrt2
357    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2
358    if(usepaw == 1) then
359      gsc = gsc * sqrt2
360      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2
361    end if
362  end if
363 
364  do iproc=0,nbproc-1
365    ! Build the local matrix belonging to processor iproc
366    !write(message, *) 'RR: build', iproc
367    !call wrtout(std_out,message)
368 
369    ! Get coordinates of iproc
370    coords_iproc(1) = INT(iproc / grid_dims(2))
371    coords_iproc(2) = MOD(iproc,  grid_dims(2))
372 
373    ! Get buffersize of iproc
374    buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,slk_processor%grid%dims(1))
375    buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,slk_processor%grid%dims(2))
376 
377    ! Allocate matrices_iproc, that will gather the contribution of this proc to the block owned by iproc
378    ABI_MALLOC(ham_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
379    ABI_MALLOC(ovl_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
380 
381    ! Build them
382    ABI_MALLOC(left_temp,  (2, npw*nspinor*buffsize_iproc(1)))
383    ABI_MALLOC(right_temp, (2, npw*nspinor*buffsize_iproc(2)))
384 
385    ! ovl
386    call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, &
387 &   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
388    if(usepaw == 1) then
389      call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, right_temp, &
390 &     buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
391    else
392      call from_mat_to_block_cyclic(cg, npw*nspinor, nband, right_temp, &
393 &     buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
394    end if
395    call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,&
396 &   right_temp,vectsize,czero,ovl_iproc,buffsize_iproc(1), x_cplx=cplx)
397 
398    ! ham
399    call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, right_temp, &
400 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
401    call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,&
402 &   right_temp,vectsize,czero,ham_iproc,buffsize_iproc(1), x_cplx=cplx)
403 
404    ! Sum to iproc, and fill sca_ matrices
405    call xmpi_sum_master(ham_iproc, iproc, slk_communicator, ierr)
406    call xmpi_sum_master(ovl_iproc, iproc, slk_communicator, ierr)
407    if(iproc == slk_processor%myproc) then
408      ! DCOPY to bypass the real/complex issue
409      if(cplx == 2) then
410        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_cplx, 1)
411        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_cplx, 1)
412      else
413        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_real, 1)
414        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_real, 1)
415      end if
416    end if
417 
418    ABI_FREE(ham_iproc)
419    ABI_FREE(ovl_iproc)
420    ABI_FREE(left_temp)
421    ABI_FREE(right_temp)
422  end do
423 
424  ! Final sum
425  if(cplx == 2) then
426    call xmpi_sum(sca_ham%buffer_cplx, slk_complement_communicator, ierr)
427    call xmpi_sum(sca_ovl%buffer_cplx, slk_complement_communicator, ierr)
428  else
429    call xmpi_sum(sca_ham%buffer_real, slk_complement_communicator, ierr)
430    call xmpi_sum(sca_ovl%buffer_real, slk_complement_communicator, ierr)
431  end if
432 
433  ! Transform back
434  if(istwf_k == 2) then
435    cg = cg / sqrt2
436    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2
437    ghc = ghc / sqrt2
438    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2
439    if(usepaw == 1) then
440      gsc = gsc / sqrt2
441      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2
442    end if
443  end if
444  call timab(timer_subham, 2, tsec)
445 
446  !======================================================================================================
447  ! Do the diagonalization
448  !======================================================================================================
449  !write(message, *) 'RR: diag'
450  !call wrtout(std_out,message)
451  call timab(timer_subdiago, 1, tsec)
452  call compute_generalized_eigen_problem(slk_processor,sca_ham,sca_ovl,&
453 & sca_evec,eig,slk_communicator,istwf_k)
454  call timab(timer_subdiago, 2, tsec)
455 
456  !======================================================================================================
457  ! Perform rotation
458  !======================================================================================================
459  call timab(timer_rotation, 1, tsec)
460  cg_new = zero
461  gsc_or_vnlxc_new = zero
462  ghc_new = zero
463  do iproc=0,nbproc-1
464    ! Compute the contribution to the rotated matrices from this block
465    !write(message, *) 'RR: rot', iproc
466    !call wrtout(std_out,message)
467 
468    ! Get coordinates of iproc
469    coords_iproc(1) = INT(iproc / grid_dims(2))
470    coords_iproc(2) = MOD(iproc,  grid_dims(2))
471 
472    ! Get buffersize of iproc
473    buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,slk_processor%grid%dims(1))
474    buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,slk_processor%grid%dims(2))
475 
476    ! Get data from iproc
477    ABI_MALLOC(evec_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2)))
478    if(iproc == slk_processor%myproc) then
479      if(cplx == 2) then
480        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_cplx, 1, evec_iproc, 1)
481      else
482        call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_real, 1, evec_iproc, 1)
483      end if
484    end if
485    call xmpi_bcast(evec_iproc,iproc,slk_communicator,ierr)
486 
487    ! Compute contribution to the rotated matrices from iproc
488    ABI_MALLOC(left_temp,  (2,npw*nspinor*buffsize_iproc(1)))
489    ABI_MALLOC(right_temp,  (2,npw*nspinor*buffsize_iproc(2)))
490 
491    ! cg
492    call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, &
493 &   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
494    call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
495 &   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx)
496    call from_block_cyclic_to_mat(cg_new, npw*nspinor, nband, right_temp, &
497 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
498 
499    ! ghc
500    call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, left_temp, &
501 &   buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
502    call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
503 &   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize,x_cplx=cplx)
504    call from_block_cyclic_to_mat(ghc_new, npw*nspinor, nband, right_temp,&
505 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
506 
507    ! gsc or gvnlxc
508    if(usepaw == 1) then
509      call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, left_temp, &
510 &     buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
511    endif
512    if(usepaw==0 .or. has_fock)then
513      call from_mat_to_block_cyclic(gvnlxc, npw*nspinor, nband, left_temp, &
514 &     buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1))
515    end if
516    call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,&
517 &   evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx)
518    call from_block_cyclic_to_mat(gsc_or_vnlxc_new, npw*nspinor, nband, right_temp, &
519 &   buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2))
520 
521    ABI_FREE(evec_iproc)
522    ABI_FREE(left_temp)
523    ABI_FREE(right_temp)
524  end do
525 
526  ! Overwrite with _new
527  cg = cg_new
528  ghc = ghc_new
529  if(usepaw == 1) then
530    gsc = gsc_or_vnlxc_new
531  endif
532  if(usepaw==0 .or. has_fock)then
533    gvnlxc = gsc_or_vnlxc_new
534  end if
535  call timab(timer_rotation, 2, tsec)
536 
537  call sca_ham%free()
538  call sca_ovl%free()
539  call sca_evec%free()
540 
541 end subroutine rayleigh_ritz_distributed
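
 The bookkeeping done at the top of both iproc loops above (recovering the row/column coordinates
 of iproc on the 2D process grid and the size of its local block-cyclic buffer) is sketched by the
 standalone program below. It is not ABINIT code: the 2x3 grid, nband=11 and blocksize=2 are
 arbitrary example values, and numroc_local is a hypothetical re-implementation of ScaLAPACK's
 NUMROC for a zero source process, used only to keep the sketch self-contained.

  program grid_bookkeeping
    implicit none
    integer, parameter :: nband = 11, blocksize = 2
    integer, parameter :: grid_dims(2) = [2, 3]   ! example 2x3 process grid
    integer :: iproc, coords(2), buffsize(2)

    do iproc = 0, grid_dims(1)*grid_dims(2) - 1
      ! Same row-major decomposition as in rayleigh_ritz_distributed
      coords(1) = iproc / grid_dims(2)
      coords(2) = mod(iproc, grid_dims(2))
      buffsize(1) = numroc_local(nband, blocksize, coords(1), grid_dims(1))
      buffsize(2) = numroc_local(nband, blocksize, coords(2), grid_dims(2))
      write(*,'(a,i0,a,2i3,a,2i4)') 'iproc ', iproc, '  coords', coords, '  local sizes', buffsize
    end do

  contains

    ! Number of rows/columns owned by process myproc when n items are distributed
    ! block-cyclically with block size nb over nprocs processes (equivalent to
    ! ScaLAPACK's NUMROC with a zero source process).
    integer function numroc_local(n, nb, myproc, nprocs)
      integer, intent(in) :: n, nb, myproc, nprocs
      integer :: nblocks, extra
      nblocks = n / nb
      numroc_local = (nblocks/nprocs)*nb
      extra = mod(nblocks, nprocs)
      if (myproc < extra) then
        numroc_local = numroc_local + nb
      else if (myproc == extra) then
        numroc_local = numroc_local + mod(n, nb)
      end if
    end function numroc_local

  end program grid_bookkeeping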

ABINIT/rayleigh_ritz_subdiago [ Functions ]

NAME

 rayleigh_ritz_subdiago

FUNCTION

 Performs a Rayleigh-Ritz procedure (subspace rotation), building the
 Hamiltonian/overlap matrices in full and calling the subdiago method

INPUTS

  mpi_enreg=information about MPI parallelization
  nband=number of bands at this k point for that spin polarization
  npw=number of plane waves at this k point
  nspinor=number of spinorial components of the wavefunctions
  usepaw=1 if the PAW method is used
  istwf_k=option parameter describing the storage of the wavefunctions at this k point
  has_fock=true if the Hamiltonian includes a Fock-exchange contribution

OUTPUT

  eig(nband)=array for holding eigenvalues (hartree)

SIDE EFFECTS

  cg(2,*)=updated wavefunctions
  ghc(2,*)=updated ghc
  gsc(2,*)=updated gsc
  gvnlxc(2,*)=updated gvnlxc

NOTES

  TODO choose generalized eigenproblem or ortho + diago (see #if 1)

SOURCE

 81 subroutine rayleigh_ritz_subdiago(cg,ghc,gsc,gvnlxc,eig,has_fock,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw)
 82 
 83  ! Arguments
 84  type(mpi_type),intent(inout) :: mpi_enreg
 85  integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k
 86  real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlxc(2,npw*nspinor*nband)
 87  real(dp),intent(out) :: eig(nband)
 88  logical :: has_fock
 89 
 90  ! Locals
 91  real(dp), allocatable :: subham(:), totham(:,:)
 92  real(dp), allocatable :: subovl(:), totovl(:,:)
 93  real(dp), allocatable :: evec(:,:), edummy(:,:)
 94  integer :: ierr, cplx, vectsize
 95  real(dp) :: gtempc(2,npw*nspinor*nband), tsec(2)
 96  character :: blas_transpose
 97 
 98  integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603
 99  integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607
100  integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610
101 
102  ! *************************************************************************
103 
104  if(istwf_k == 1) then
105    cplx = 2
106    vectsize = npw*nspinor
107    blas_transpose = 'c'
108  else
109    cplx = 1
110    vectsize = 2*npw*nspinor
111    blas_transpose = 't'
112  end if
113 
114 #if 1
115  call timab(timer_subham, 1, tsec)
116 
117  ! Transform cg, ghc and maybe gsc, according to istwf_k
118  if(istwf_k == 2) then
119    cg = cg * sqrt2
120    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2
121    ghc = ghc * sqrt2
122    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2
123    if(usepaw == 1) then
124      gsc = gsc * sqrt2
125      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2
126    end if
127  end if
128 
129  ! Build, pack and sum subham
130  ABI_MALLOC(subham, (cplx*nband*(nband+1)/2))
131  ABI_MALLOC(totham, (cplx, nband*nband))
132  call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,ghc,vectsize,&
133 & cg,vectsize,czero,totham,nband, x_cplx=cplx)
134  call pack_matrix(totham, subham, nband, cplx)
135  ABI_FREE(totham)
136  call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr)
137 
138 
139  ! Same for subovl
140  ABI_MALLOC(subovl, (cplx*nband*(nband+1)/2))
141  ABI_MALLOC(totovl, (cplx, nband*nband))
142  if(usepaw == 1) then
143    call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,gsc,vectsize,&
144 &   cg,vectsize,czero,totovl,nband, x_cplx=cplx)
145  else
146    call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,cg,vectsize,&
147 &   cg,vectsize,czero,totovl,nband, x_cplx=cplx)
148  end if
149  call pack_matrix(totovl, subovl, nband, cplx)
150  ABI_FREE(totovl)
151  call xmpi_sum(subovl,mpi_enreg%comm_bandspinorfft,ierr)
152 
153 
154  ! Transform back
155  if(istwf_k == 2) then
156    cg = cg / sqrt2
157    if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2
158    ghc = ghc / sqrt2
159    if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2
160    if(usepaw == 1) then
161      gsc = gsc / sqrt2
162      if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2
163    end if
164  end if
165  call timab(timer_subham, 2, tsec)
166 
167 
168  call timab(timer_subdiago, 1, tsec)
169  ABI_MALLOC(evec, (cplx*nband, nband))
170 
171  call abi_xhpgv(1,'V','U',nband,subham,subovl,eig,evec,nband,istwf_k=istwf_k,use_slk=mpi_enreg%paral_kgb)
172 
173  ABI_FREE(subham)
174  ABI_FREE(subovl)
175 
176 ! Fix the phase (the matrix is diagonalised simultaneously by different processors,
177 ! which may yield different unitary transforms, thus breaking the coherency of the
178 ! parts of cg stored on different processors).
179 ! call cg_normev(evec,nband,nband)  ! Unfortunately cg_normev requires normalized vectors, so fxphas_seq is used instead
180  ABI_MALLOC(edummy, (cplx*nband, nband))
181  call fxphas_seq(evec,edummy,0,0,1,nband*nband,nband*nband,nband,nband,0)
182  ABI_FREE(edummy)
183 
184  ! Rotate
185  call abi_xgemm('n','n',vectsize,nband, nband,cone,cg , vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
186  cg = gtempc
187  call abi_xgemm('n','n',vectsize,nband, nband,cone,ghc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
188  ghc = gtempc
189  if(usepaw == 1) then
190    call abi_xgemm('n','n',vectsize,nband, nband,cone,gsc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
191    gsc = gtempc
192  endif
193  if(usepaw==0 .or. has_fock)then
194    call abi_xgemm('n','n',vectsize,nband, nband,cone,gvnlxc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx)
195    gvnlxc = gtempc
196  end if
197  ABI_FREE(evec)
198  call timab(timer_subdiago, 2, tsec)
199 
200 #else
201  !! TODO non-functional, should be rewritten. Possibly faster (tests needed)
202  call wrtout(std_out, 'Transposed, orthogonalizing')
203 
204  ! orthonormalization
205  call timab(timer_ortho, 1, tsec)
206  if (usepaw==1) then
207    call abi_xorthonormalize(cg, gsc,nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2)
208  else
209    call abi_xorthonormalize(cg, cg, nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2)
210  end if
211  call timab(timer_ortho, 2, tsec)
212 
213  ! rotate ghc, gsc and gvnlxc
214  call timab(timer_rotation, 1, tsec)
215  call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, ghc,npw*nspinor,x_cplx=2)
216  if(usepaw==1) then
217    call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gsc,npw*nspinor,x_cplx=2)
218  endif
219    if(usepaw==0 .or. has_fock)then
220    call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gvnlxc,npw*nspinor,x_cplx=2)
221  end if
222  call timab(timer_rotation, 2, tsec)
223 
224  call wrtout(std_out, 'Orthogonalized, building subham')
225 
226  ! build hamiltonian  in subspace
227  call timab(timer_subham, 1, tsec)
228  call abi_xgemm(blas_transpose,'n',nband,nband,npw*nspinor,cone,ghc,npw*nspinor,&
229 & cg,npw*nspinor,czero,totham,nband, x_cplx=2)
230  ! pack for compatibility with subdiago
231  call pack_matrix(totham, subham, nband)
232  call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr)
233  call timab(timer_subham, 2, tsec)
234 
235  call wrtout(std_out, 'Subham built, diagonalizing')
236 
237  ! Rayleigh-Ritz
238  call timab(timer_subdiago,1,tsec)
239  call subdiago(cg,eig,evec,gsc,0,0,gs_hamk%istwf_k,&
240 & mcg,mcg,nband,npw,nspinor,dtset%paral_kgb,&
241 & subham,dummy,0,gs_hamk%usepaw,mpi_enreg%me_g0)
242  call timab(timer_subdiago,2,tsec)
243 
244  call wrtout(std_out, 'Diagonalization done')
245 
246  ! Rotate ghc and gvnlxc according to evecs
247  call timab(timer_rotation, 1, tsec)
248  call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,ghc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2)
249  ghc = gtempc
250    if(usepaw==0 .or. has_fock)then
251    call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,gvnlxc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2)
252    gvnlxc = gtempc
253  end if
254  call timab(timer_rotation, 2, tsec)
255 
256 #endif
257 
258 end subroutine rayleigh_ritz_subdiago
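
 Before the diagonalisation, subham and subovl are stored in packed form. The standalone snippet
 below is a minimal sketch (not ABINIT code, real case only, i.e. cplx=1) of the column-major
 upper-triangular packing that the packed arrays are assumed to follow after pack_matrix,
 consistent with the 'U' argument passed to abi_xhpgv: element (i,j) of the full matrix with
 i<=j is stored at position i + (j-1)*j/2 of the packed array.

  program packed_upper_demo
    implicit none
    integer, parameter :: n = 3
    double precision :: a(n,n), ap(n*(n+1)/2)
    integer :: i, j

    ! Build a small symmetric test matrix
    do j = 1, n
      do i = 1, n
        a(i,j) = dble(i + j)
      end do
    end do

    ! Column-major packing of the upper triangle: ap holds
    ! a(1,1), a(1,2), a(2,2), a(1,3), a(2,3), a(3,3)
    do j = 1, n
      do i = 1, j
        ap(i + (j-1)*j/2) = a(i,j)
      end do
    end do

    write(*,'(6f6.1)') ap   ! expected: 2.0 3.0 4.0 4.0 5.0 6.0
  end program packed_upper_demo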