TABLE OF CONTENTS
- ABINIT/from_block_cyclic_to_mat
- ABINIT/from_mat_to_block_cyclic
- ABINIT/m_rayleigh_ritz
- ABINIT/rayleigh_ritz_distributed
- ABINIT/rayleigh_ritz_subdiago
ABINIT/from_block_cyclic_to_mat [ Functions ]
NAME
from_block_cyclic_to_mat
FUNCTION
Fills the columns of full_mat owned by iproc with block_cyclic_mat, using a 1D block-cyclic distribution
INPUTS
block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat vectsize=number of rows nband=number of columns of the full matrix buffsize=size of the local part of the matrix blocksize=size of block for the block-cyclic distribution iproc=local processor number nprocs=total number of processors
OUTPUT
full_mat(2,vectsize*nband)=full matrix
SIDE EFFECTS
SOURCE
616 subroutine from_block_cyclic_to_mat(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs) 617 618 integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs 619 real(dp), intent(inout) :: full_mat(2, vectsize*nband) 620 real(dp), intent(in) :: block_cyclic_mat(2, vectsize*buffsize) 621 622 integer :: shift_fullmat, shift_block_cyclic_mat 623 integer :: cur_bsize 624 625 ! ************************************************************************* 626 627 shift_fullmat = iproc*blocksize 628 shift_block_cyclic_mat = 0 629 630 do while(.true.) 631 cur_bsize = MIN(blocksize, nband-shift_fullmat) 632 if(cur_bsize <= 0) exit 633 full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) =& 634 & full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) +& 635 & block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize) 636 637 shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize 638 shift_fullmat = shift_fullmat + blocksize*nprocs 639 end do 640 641 end subroutine from_block_cyclic_to_mat
ABINIT/from_mat_to_block_cyclic [ Functions ]
NAME
from_mat_to_block_cyclic
FUNCTION
Fills block_cyclic_mat with the columns of full_mat owned by iproc, using a 1D block-cyclic distribution
INPUTS
full_mat(2,vectsize*nband)=full input matrix vectsize=number of rows nband=number of columns of the full matrix buffsize=size of the local part of the matrix blocksize=size of block for the block-cyclic distribution iproc=local processor number nprocs=total number of processors
OUTPUT
block_cyclic_mat(2,vectsize*buffsize)=local part of full_mat
SIDE EFFECTS
SOURCE
566 subroutine from_mat_to_block_cyclic(full_mat, vectsize, nband, block_cyclic_mat, buffsize, blocksize, iproc, nprocs) 567 568 integer, intent(in) :: vectsize, nband, buffsize, blocksize, iproc, nprocs 569 real(dp), intent(in) :: full_mat(2, vectsize*nband) 570 real(dp), intent(inout) :: block_cyclic_mat(2, vectsize*buffsize) 571 572 integer :: shift_fullmat, shift_block_cyclic_mat 573 integer :: cur_bsize 574 575 ! ************************************************************************* 576 577 shift_fullmat = iproc*blocksize 578 shift_block_cyclic_mat = 0 579 580 do while(.true.) 581 cur_bsize = MIN(blocksize, nband-shift_fullmat) 582 if(cur_bsize <= 0) exit 583 block_cyclic_mat(:, shift_block_cyclic_mat*vectsize+1 : shift_block_cyclic_mat*vectsize + cur_bsize*vectsize) = & 584 & full_mat(:, shift_fullmat*vectsize+1 : shift_fullmat*vectsize + cur_bsize*vectsize) 585 586 shift_block_cyclic_mat = shift_block_cyclic_mat + blocksize 587 shift_fullmat = shift_fullmat + blocksize*nprocs 588 end do 589 590 end subroutine from_mat_to_block_cyclic
ABINIT/m_rayleigh_ritz [ Modules ]
NAME
m_rayleigh_ritz
FUNCTION
This file contains routines that perform the Rayleigh-Ritz, either by forming the full matrix (_subdiago) or by forming the distributed matrix in block-cyclic form (_distributed)
COPYRIGHT
Copyright (C) 2014-2024 ABINIT group (AL) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
18 #if defined HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include "abi_common.h" 23 24 module m_rayleigh_ritz 25 26 use defs_basis 27 use m_errors 28 use m_cgtools 29 use m_xmpi 30 use m_abicore 31 use m_abi_linalg 32 use m_slk 33 34 use defs_abitypes, only : mpi_type 35 use m_time, only : timab 36 use m_numeric_tools, only : pack_matrix 37 38 implicit none 39 40 private
ABINIT/rayleigh_ritz_distributed [ Functions ]
NAME
rayleigh_ritz_distributed
FUNCTION
Performs a rayleigh-ritz procedure (subspace rotation), building the distributed hamiltonian/overlap matrices directly, and calling the ScaLapack routines
INPUTS
mpi_enreg=information about MPI parallelization nband=number of bands at this k point for that spin polarization npw=number of plane waves at this k point nspinor=number of plane waves at this k point usepaw=do we use the PAW method
OUTPUT
eig(nband)=array for holding eigenvalues (hartree)
SIDE EFFECTS
cg(2,*)=updated wavefunctions ghc(2,*)=updated ghc gsc(2,*)=updated gsc gvnlxc(2,*)=updated gvnlxc
NOTES
Should profile for large test cases and see where the bottleneck is. Is it the copies? Should we do partial GEMMs? Is it the latency? Should we buffer more? Should we overlap computations and communications? (easy in theory, tedious in practice)
SOURCE
294 subroutine rayleigh_ritz_distributed(cg,ghc,gsc,gvnlxc,eig,has_fock,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw) 295 296 integer,external :: NUMROC 297 298 ! Arguments 299 type(mpi_type),intent(inout) :: mpi_enreg 300 integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k 301 real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlxc(2,npw*nspinor*nband) 302 real(dp),intent(out) :: eig(nband) 303 logical :: has_fock 304 305 ! Locals 306 integer :: blocksize,nbproc,iproc,ierr,cplx,vectsize 307 integer :: buffsize_iproc(2), coords_iproc(2), grid_dims(2) 308 real(dp) :: cg_new(2,npw*nspinor*nband),gsc_or_vnlxc_new(2,npw*nspinor*nband),ghc_new(2,npw*nspinor*nband) 309 real(dp), allocatable :: ham_iproc(:,:), ovl_iproc(:,:), evec_iproc(:,:), left_temp(:,:), right_temp(:,:) 310 real(dp) :: tsec(2) 311 type(matrix_scalapack) :: sca_ham, sca_ovl, sca_evec 312 !character(len=500) :: message 313 character :: blas_transpose 314 315 integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603 316 integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607 317 integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610 318 319 ! ************************************************************************* 320 321 if(istwf_k == 1) then 322 cplx = 2 323 vectsize = npw*nspinor 324 blas_transpose = 'c' 325 else 326 cplx = 1 327 vectsize = 2*npw*nspinor 328 blas_transpose = 't' 329 end if 330 331 !write(message, *) 'RR: init' 332 !call wrtout(std_out,message) 333 !====================================================================================================== 334 ! Init Scalapack matrices 335 !====================================================================================================== 336 call sca_ham%init(nband,nband,slk_processor,istwf_k) 337 call sca_ovl%init(nband,nband,slk_processor,istwf_k) 338 call sca_evec%init(nband,nband,slk_processor,istwf_k) 339 340 ! Get info 341 blocksize = sca_ham%sizeb_blocs(1) ! Assume square blocs 342 nbproc = slk_processor%grid%nbprocs 343 grid_dims = slk_processor%grid%dims 344 345 !====================================================================================================== 346 ! Build hamiltonian and overlap matrices 347 !====================================================================================================== 348 ! TODO maybe we should avoid copies at the price of less BLAS efficiency (when blocksize is small)? must profile. 349 350 call timab(timer_subham, 1, tsec) 351 352 ! Transform cg, ghc and maybe gsc, according to istwf_k 353 if(istwf_k == 2) then 354 cg = cg * sqrt2 355 if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2 356 ghc = ghc * sqrt2 357 if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2 358 if(usepaw == 1) then 359 gsc = gsc * sqrt2 360 if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2 361 end if 362 end if 363 364 do iproc=0,nbproc-1 365 ! Build the local matrix belonging to processor iproc 366 !write(message, *) 'RR: build', iproc 367 !call wrtout(std_out,message) 368 369 ! Get coordinates of iproc 370 coords_iproc(1) = INT(iproc / grid_dims(2)) 371 coords_iproc(2) = MOD(iproc, grid_dims(2)) 372 373 ! Get buffersize of iproc 374 buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,slk_processor%grid%dims(1)) 375 buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,slk_processor%grid%dims(2)) 376 377 ! Allocate matrices_iproc, that will gather the contribution of this proc to the block owned by iproc 378 ABI_MALLOC(ham_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2))) 379 ABI_MALLOC(ovl_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2))) 380 381 ! Build them 382 ABI_MALLOC(left_temp, (2, npw*nspinor*buffsize_iproc(1))) 383 ABI_MALLOC(right_temp, (2, npw*nspinor*buffsize_iproc(2))) 384 385 ! ovl 386 call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, & 387 & buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1)) 388 if(usepaw == 1) then 389 call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, right_temp, & 390 & buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2)) 391 else 392 call from_mat_to_block_cyclic(cg, npw*nspinor, nband, right_temp, & 393 & buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2)) 394 end if 395 call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,& 396 & right_temp,vectsize,czero,ovl_iproc,buffsize_iproc(1), x_cplx=cplx) 397 398 ! ham 399 call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, right_temp, & 400 & buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2)) 401 call abi_xgemm(blas_transpose,'n',buffsize_iproc(1),buffsize_iproc(2),vectsize,cone,left_temp,vectsize,& 402 & right_temp,vectsize,czero,ham_iproc,buffsize_iproc(1), x_cplx=cplx) 403 404 ! Sum to iproc, and fill sca_ matrices 405 call xmpi_sum_master(ham_iproc, iproc, slk_communicator, ierr) 406 call xmpi_sum_master(ovl_iproc, iproc, slk_communicator, ierr) 407 if(iproc == slk_processor%myproc) then 408 ! DCOPY to bypass the real/complex issue 409 if(cplx == 2) then 410 call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_cplx, 1) 411 call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_cplx, 1) 412 else 413 call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ham_iproc, 1, sca_ham%buffer_real, 1) 414 call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), ovl_iproc, 1, sca_ovl%buffer_real, 1) 415 end if 416 end if 417 418 ABI_FREE(ham_iproc) 419 ABI_FREE(ovl_iproc) 420 ABI_FREE(left_temp) 421 ABI_FREE(right_temp) 422 end do 423 424 ! Final sum 425 if(cplx == 2) then 426 call xmpi_sum(sca_ham%buffer_cplx, slk_complement_communicator, ierr) 427 call xmpi_sum(sca_ovl%buffer_cplx, slk_complement_communicator, ierr) 428 else 429 call xmpi_sum(sca_ham%buffer_real, slk_complement_communicator, ierr) 430 call xmpi_sum(sca_ovl%buffer_real, slk_complement_communicator, ierr) 431 end if 432 433 ! Transform back 434 if(istwf_k == 2) then 435 cg = cg / sqrt2 436 if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2 437 ghc = ghc / sqrt2 438 if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2 439 if(usepaw == 1) then 440 gsc = gsc / sqrt2 441 if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2 442 end if 443 end if 444 call timab(timer_subham, 2, tsec) 445 446 !====================================================================================================== 447 ! Do the diagonalization 448 !====================================================================================================== 449 !write(message, *) 'RR: diag' 450 !call wrtout(std_out,message) 451 call timab(timer_subdiago, 1, tsec) 452 call compute_generalized_eigen_problem(slk_processor,sca_ham,sca_ovl,& 453 & sca_evec,eig,slk_communicator,istwf_k) 454 call timab(timer_subdiago, 2, tsec) 455 456 !====================================================================================================== 457 ! Perform rotation 458 !====================================================================================================== 459 call timab(timer_rotation, 1, tsec) 460 cg_new = zero 461 gsc_or_vnlxc_new = zero 462 ghc_new = zero 463 do iproc=0,nbproc-1 464 ! Compute the contribution to the rotated matrices from this block 465 !write(message, *) 'RR: rot', iproc 466 !call wrtout(std_out,message) 467 468 ! Get coordinates of iproc 469 coords_iproc(1) = INT(iproc / grid_dims(2)) 470 coords_iproc(2) = MOD(iproc, grid_dims(2)) 471 472 ! Get buffersize of iproc 473 buffsize_iproc(1) = NUMROC(nband,blocksize,coords_iproc(1),0,slk_processor%grid%dims(1)) 474 buffsize_iproc(2) = NUMROC(nband,blocksize,coords_iproc(2),0,slk_processor%grid%dims(2)) 475 476 ! Get data from iproc 477 ABI_MALLOC(evec_iproc, (cplx*buffsize_iproc(1), buffsize_iproc(2))) 478 if(iproc == slk_processor%myproc) then 479 if(cplx == 2) then 480 call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_cplx, 1, evec_iproc, 1) 481 else 482 call DCOPY(cplx*buffsize_iproc(1)*buffsize_iproc(2), sca_evec%buffer_real, 1, evec_iproc, 1) 483 end if 484 end if 485 call xmpi_bcast(evec_iproc,iproc,slk_communicator,ierr) 486 487 ! Compute contribution to the rotated matrices from iproc 488 ABI_MALLOC(left_temp, (2,npw*nspinor*buffsize_iproc(1))) 489 ABI_MALLOC(right_temp, (2,npw*nspinor*buffsize_iproc(2))) 490 491 ! cg 492 call from_mat_to_block_cyclic(cg, npw*nspinor, nband, left_temp, & 493 & buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1)) 494 call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,& 495 & evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx) 496 call from_block_cyclic_to_mat(cg_new, npw*nspinor, nband, right_temp, & 497 & buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2)) 498 499 ! ghc 500 call from_mat_to_block_cyclic(ghc, npw*nspinor, nband, left_temp, & 501 & buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1)) 502 call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,& 503 & evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize,x_cplx=cplx) 504 call from_block_cyclic_to_mat(ghc_new, npw*nspinor, nband, right_temp,& 505 & buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2)) 506 507 ! gsc or vnlc 508 if(usepaw == 1) then 509 call from_mat_to_block_cyclic(gsc, npw*nspinor, nband, left_temp, & 510 & buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1)) 511 endif 512 if(usepaw==0 .or. has_fock)then 513 call from_mat_to_block_cyclic(gvnlxc, npw*nspinor, nband, left_temp, & 514 & buffsize_iproc(1), blocksize, coords_iproc(1), grid_dims(1)) 515 end if 516 call abi_xgemm('n','n',vectsize,buffsize_iproc(2),buffsize_iproc(1),cone,left_temp,vectsize,& 517 & evec_iproc, buffsize_iproc(1), czero, right_temp, vectsize, x_cplx=cplx) 518 call from_block_cyclic_to_mat(gsc_or_vnlxc_new, npw*nspinor, nband, right_temp, & 519 & buffsize_iproc(2), blocksize, coords_iproc(2), grid_dims(2)) 520 521 ABI_FREE(evec_iproc) 522 ABI_FREE(left_temp) 523 ABI_FREE(right_temp) 524 end do 525 526 ! Overwrite with _new 527 cg = cg_new 528 ghc = ghc_new 529 if(usepaw == 1) then 530 gsc = gsc_or_vnlxc_new 531 endif 532 if(usepaw==0 .or. has_fock)then 533 gvnlxc = gsc_or_vnlxc_new 534 end if 535 call timab(timer_rotation, 2, tsec) 536 537 call sca_ham%free() 538 call sca_ovl%free() 539 call sca_evec%free() 540 541 end subroutine rayleigh_ritz_distributed
ABINIT/rayleigh_ritz_subdiago [ Functions ]
NAME
rayleigh_ritz_subdiago
FUNCTION
Performs a rayleigh-ritz procedure (subspace rotation), building the hamiltonian/overlap matrices in full and calling the subdiago method
INPUTS
mpi_enreg=information about MPI parallelization nband=number of bands at this k point for that spin polarization npw=number of plane waves at this k point nspinor=number of plane waves at this k point usepaw=if 1 we use the PAW method
OUTPUT
eig(nband)=array for holding eigenvalues (hartree)
SIDE EFFECTS
cg(2,*)=updated wavefunctions ghc(2,*)=updated ghc gsc(2,*)=updated gsc gvnlxc(2,*)=updated gvnlxc
NOTES
TODO choose generalized eigenproblem or ortho + diago (see #if 1)
SOURCE
81 subroutine rayleigh_ritz_subdiago(cg,ghc,gsc,gvnlxc,eig,has_fock,istwf_k,mpi_enreg,nband,npw,nspinor,usepaw) 82 83 ! Arguments 84 type(mpi_type),intent(inout) :: mpi_enreg 85 integer,intent(in) :: nband,npw,nspinor,usepaw,istwf_k 86 real(dp),intent(inout) :: cg(2,npw*nspinor*nband),gsc(2,npw*nspinor*nband),ghc(2,npw*nspinor*nband),gvnlxc(2,npw*nspinor*nband) 87 real(dp),intent(out) :: eig(nband) 88 logical :: has_fock 89 90 ! Locals 91 real(dp), allocatable :: subham(:), totham(:,:) 92 real(dp), allocatable :: subovl(:), totovl(:,:) 93 real(dp), allocatable :: evec(:,:), edummy(:,:) 94 integer :: ierr, cplx, vectsize 95 real(dp) :: gtempc(2,npw*nspinor*nband), tsec(2) 96 character :: blas_transpose 97 98 integer, parameter :: timer_chebfi = 1600, timer_alltoall = 1601, timer_apply_inv_ovl = 1602, timer_rotation = 1603 99 integer, parameter :: timer_subdiago = 1604, timer_subham = 1605, timer_ortho = 1606, timer_getghc = 1607 100 integer, parameter :: timer_residuals = 1608, timer_update_eigen = 1609, timer_sync = 1610 101 102 ! ************************************************************************* 103 104 if(istwf_k == 1) then 105 cplx = 2 106 vectsize = npw*nspinor 107 blas_transpose = 'c' 108 else 109 cplx = 1 110 vectsize = 2*npw*nspinor 111 blas_transpose = 't' 112 end if 113 114 #if 1 115 call timab(timer_subham, 1, tsec) 116 117 ! Transform cg, ghc and maybe gsc, according to istwf_k 118 if(istwf_k == 2) then 119 cg = cg * sqrt2 120 if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) / sqrt2 121 ghc = ghc * sqrt2 122 if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) / sqrt2 123 if(usepaw == 1) then 124 gsc = gsc * sqrt2 125 if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) / sqrt2 126 end if 127 end if 128 129 ! Build, pack and sum suham 130 ABI_MALLOC(subham, (cplx*nband*(nband+1)/2)) 131 ABI_MALLOC(totham, (cplx, nband*nband)) 132 call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,ghc,vectsize,& 133 & cg,vectsize,czero,totham,nband, x_cplx=cplx) 134 call pack_matrix(totham, subham, nband, cplx) 135 ABI_FREE(totham) 136 call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr) 137 138 139 ! Same for subovl 140 ABI_MALLOC(subovl, (cplx*nband*(nband+1)/2)) 141 ABI_MALLOC(totovl, (cplx, nband*nband)) 142 if(usepaw == 1) then 143 call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,gsc,vectsize,& 144 & cg,vectsize,czero,totovl,nband, x_cplx=cplx) 145 else 146 call abi_xgemm(blas_transpose,'n',nband,nband,vectsize,cone,cg,vectsize,& 147 & cg,vectsize,czero,totovl,nband, x_cplx=cplx) 148 end if 149 call pack_matrix(totovl, subovl, nband, cplx) 150 ABI_FREE(totovl) 151 call xmpi_sum(subovl,mpi_enreg%comm_bandspinorfft,ierr) 152 153 154 ! Transform back 155 if(istwf_k == 2) then 156 cg = cg / sqrt2 157 if(mpi_enreg%me_g0 == 1) cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2 158 ghc = ghc / sqrt2 159 if(mpi_enreg%me_g0 == 1) ghc(:, 1:npw*nspinor*nband:npw) = ghc(:, 1:npw*nspinor*nband:npw) * sqrt2 160 if(usepaw == 1) then 161 gsc = gsc / sqrt2 162 if(mpi_enreg%me_g0 == 1) gsc(:, 1:npw*nspinor*nband:npw) = gsc(:, 1:npw*nspinor*nband:npw) * sqrt2 163 end if 164 end if 165 call timab(timer_subham, 2, tsec) 166 167 168 call timab(timer_subdiago, 1, tsec) 169 ABI_MALLOC(evec, (cplx*nband, nband)) 170 171 call abi_xhpgv(1,'V','U',nband,subham,subovl,eig,evec,nband,istwf_k=istwf_k,use_slk=mpi_enreg%paral_kgb) 172 173 ABI_FREE(subham) 174 ABI_FREE(subovl) 175 176 ! Fix the phase (this is because of the simultaneous diagonalisation of this 177 ! matrix by different processors, allowing to get different unitary transforms, thus breaking the 178 ! coherency of parts of cg stored on different processors). 179 ! call cg_normev(evec,nband,nband) ! Unfortunately, for cg_normev to work, one needs the vectors to be normalized, so uses fxphas_seq 180 ABI_MALLOC(edummy, (cplx*nband, nband)) 181 call fxphas_seq(evec,edummy,0,0,1,nband*nband,nband*nband,nband,nband,0) 182 ABI_FREE(edummy) 183 184 ! Rotate 185 call abi_xgemm('n','n',vectsize,nband, nband,cone,cg , vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx) 186 cg = gtempc 187 call abi_xgemm('n','n',vectsize,nband, nband,cone,ghc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx) 188 ghc = gtempc 189 if(usepaw == 1) then 190 call abi_xgemm('n','n',vectsize,nband, nband,cone,gsc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx) 191 gsc = gtempc 192 endif 193 if(usepaw==0 .or. has_fock)then 194 call abi_xgemm('n','n',vectsize,nband, nband,cone,gvnlxc, vectsize, evec, nband, czero, gtempc, vectsize, x_cplx=cplx) 195 gvnlxc = gtempc 196 end if 197 ABI_FREE(evec) 198 call timab(timer_subdiago, 2, tsec) 199 200 #else 201 !! TODO non-functional, should be rewritten. Possibly faster (tests needed) 202 call wrtout(std_out, 'Transposed, orthogonalizing') 203 204 ! orthonormalization 205 call timab(timer_ortho, 1, tsec) 206 if (usepaw==1) then 207 call abi_xorthonormalize(cg, gsc,nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2) 208 else 209 call abi_xorthonormalize(cg, cg, nband, mpi_enreg%comm_bandspinorfft, sqgram, npw*nspinor, 2) 210 end if 211 call timab(timer_ortho, 2, tsec) 212 213 ! rotate ghc, gsc and gvnlxc 214 call timab(timer_rotation, 1, tsec) 215 call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, ghc,npw*nspinor,x_cplx=2) 216 if(usepaw==1) then 217 call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gsc,npw*nspinor,x_cplx=2) 218 endif 219 if(usepaw==0 .or has_fock)then 220 call abi_xtrsm('r','u','n','n',npw*nspinor,nband,cone,sqgram,nband, gvnlxc,npw*nspinor,x_cplx=2) 221 end if 222 call timab(timer_rotation, 2, tsec) 223 224 call wrtout(std_out, 'Orthogonalized, building subham') 225 226 ! build hamiltonian in subspace 227 call timab(timer_subham, 1, tsec) 228 call abi_xgemm(blas_transpose,'n',nband,nband,npw*nspinor,cone,ghc,npw*nspinor,& 229 & cg,npw*nspinor,czero,totham,nband, x_cplx=2) 230 ! pack for compatibility with subdiago 231 call pack_matrix(totham, subham, nband) 232 call xmpi_sum(subham,mpi_enreg%comm_bandspinorfft,ierr) 233 call timab(timer_subham, 2, tsec) 234 235 call wrtout(std_out, 'Subham built, diagonalizing') 236 237 ! Rayleigh-Ritz 238 call timab(timer_subdiago,1,tsec) 239 call subdiago(cg,eig,evec,gsc,0,0,gs_hamk%istwf_k,& 240 & mcg,mcg,nband,npw,nspinor,dtset%paral_kgb,& 241 & subham,dummy,0,gs_hamk%usepaw,mpi_enreg%me_g0) 242 call timab(timer_subdiago,2,tsec) 243 244 call wrtout(std_out, 'Diagonalization done') 245 246 ! Rotate ghc and gvnlxc according to evecs 247 call timab(timer_rotation, 1, tsec) 248 call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,ghc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2) 249 ghc = gtempc 250 if(usepaw==0 .or has_fock)then 251 call abi_xgemm('n','n',npw*nspinor,nband, nband,cone,gvnlxc, npw*nspinor, evec, nband, czero, gtempc, npw*nspinor, x_cplx=2) 252 gvnlxc = gtempc 253 end if 254 call timab(timer_rotation, 2, tsec) 255 256 #endif 257 258 end subroutine rayleigh_ritz_subdiago