TABLE OF CONTENTS
ABINIT/compute_kgb_indicator [ Functions ]
NAME
compute_kgb_indicator
FUNCTION
Only for "KGB" parallelism (LOBPCG algorithm for Ground-state): Give an indicator of performance for a given distribution of processors (npband, npfft and bandpp). Determine best choice of parameters for Scalapack and/or Magma Linear Algebra routines.
COPYRIGHT
Copyright (C) 1999-2018 ABINIT group (FD) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
INPUTS
bandpp=internal lobpcg optimization variable glb_comm=communicator for global MPI communications mband=maximum number of bands. mband=maximum number of plane waves npband=number of processor 'band' npfft = number of processor 'fft' uselinalggpu=indicate if we also test the gpu linear algebra
OUTPUT
acc_kgb = indicator of performance npslk = number of process to used in communicators
SIDE EFFECTS
This routine can be used to find an indicator in order to refine automatic process distribution. This indicator is returned in acc_kgb This routine can be used to find the optimal values of np_slk parameter (ScaLapack) and wheter or not we should use Magma for Linear Algebra in lobpcgwf
PARENTS
finddistrproc
CHILDREN
abi_linalg_finalize,abi_linalg_init,abi_xhegv,abi_xorthonormalize wrtout,xmpi_bcast,xmpi_comm_free
SOURCE
47 #if defined HAVE_CONFIG_H 48 #include "config.h" 49 #endif 50 51 #include "abi_common.h" 52 53 subroutine compute_kgb_indicator(acc_kgb,bandpp,glb_comm,mband,mpw,npband,npfft,npslk,& 54 & uselinalggpu) 55 56 use defs_basis 57 use m_abi_linalg 58 use m_profiling_abi 59 use m_errors 60 use m_xmpi 61 62 use m_time, only : abi_wtime 63 64 !This section has been created automatically by the script Abilint (TD). 65 !Do not modify the following lines by hand. 66 #undef ABI_FUNC 67 #define ABI_FUNC 'compute_kgb_indicator' 68 use interfaces_14_hidewrite 69 !End of the abilint section 70 71 implicit none 72 73 !Arguments ------------------------------------ 74 !scalars 75 integer,intent(in) :: bandpp,glb_comm,mband,mpw,npband,npfft 76 integer,intent(inout) :: npslk,uselinalggpu 77 real(dp),intent(inout) :: acc_kgb 78 !arrays 79 80 !Local variables------------------------------- 81 !scalars 82 integer,parameter :: max_number_of_npslk=10,max_number_of_iter=10 83 integer :: blocksize,bigorder,ierr,ii,islk,islk1,iter,jj,keep_gpu 84 integer :: kgb_comm,my_rank,np_slk,np_slk_max,np_slk_best,nranks 85 integer :: use_lapack_gpu,use_slk,vectsize 86 real(dp) :: min_eigen,min_ortho,time_xeigen,time_xortho 87 character(len=500) :: message 88 !arrays 89 integer,allocatable :: ranks(:),val_npslk(:) 90 real(dp),allocatable :: eigen(:),grama(:,:),gramb(:,:) 91 complex(dpc),allocatable :: blockvectorbx(:,:),blockvectorx(:,:),sqgram(:,:) 92 93 !****************************************************************** 94 95 DBG_ENTER("COLL") 96 97 #ifdef DEBUG_MODE 98 write(message,'(a,3i3)') 'compute_kgb_indicator : (bpp,npb,npf) = ', bandpp, npband, npfft 99 call wrtout(std_out,message,'PERS') 100 #endif 101 102 !Create local communicator for test 103 if (xmpi_paral==1) then 104 nranks=npfft*npband 105 ABI_ALLOCATE(ranks,(nranks)) 106 ranks=(/((my_rank-1),my_rank=1,nranks)/) 107 kgb_comm=xmpi_subcomm(glb_comm,nranks,ranks,my_rank_in_group=my_rank) 108 ABI_DEALLOCATE(ranks) 109 else 110 kgb_comm=xmpi_comm_self 111 my_rank=0 112 end if 113 114 !Only for process in the new subgroup 115 if (my_rank/=xmpi_undefined) then 116 117 ! We enforce vectsize >=blocksize (This is not true in lobpcg but 118 ! these are rare cases and this simplify the matrix constructions below...) 119 blocksize=npband*bandpp 120 vectsize=max(1+mpw/(npband*npfft),blocksize) 121 bigorder=3*blocksize 122 123 ABI_ALLOCATE(blockvectorx,(vectsize,blocksize)) 124 ABI_ALLOCATE(blockvectorbx,(vectsize,blocksize)) 125 ABI_ALLOCATE(sqgram,(blocksize,blocksize)) 126 ABI_ALLOCATE(grama,(2*bigorder,bigorder)) 127 ABI_ALLOCATE(gramb,(2*bigorder,bigorder)) 128 ABI_ALLOCATE(eigen,(bigorder)) 129 ABI_ALLOCATE(val_npslk,(max_number_of_npslk)) ! not too much values tested 130 131 min_eigen=greatest_real 132 min_ortho=greatest_real 133 np_slk_best=-1 ; np_slk_max=0 134 #ifdef HAVE_LINALG_SCALAPACK 135 np_slk_max=min(max_number_of_npslk,npband*npfft) 136 #endif 137 138 ! Preselect a range of available np_slk values 139 val_npslk(1:)=0 ; val_npslk(2)=1 140 do islk=3,np_slk_max 141 np_slk=val_npslk(islk-1)*2 142 do while ((modulo(npband*npfft,np_slk)>0).and.(np_slk<(npband*npfft))) 143 np_slk=np_slk+1 144 end do 145 if(np_slk>(npband*npfft).or.np_slk>mband) exit 146 val_npslk(islk)=np_slk 147 end do 148 np_slk_max=islk-1 149 150 ! Loop over np_slk values 151 islk1=1 152 #ifdef HAVE_LINALG_MAGMA 153 islk1=1-uselinalggpu 154 #endif 155 do islk=islk1,np_slk_max 156 157 time_xortho=zero ; time_xeigen=zero 158 159 use_slk=0 160 if (islk==0) then 161 ! This is the test for the GPU 162 use_lapack_gpu=1 ; np_slk=0 163 else 164 use_lapack_gpu=0 ; np_slk=val_npslk(islk) 165 if (np_slk>0) use_slk=1 166 end if 167 168 ! Initialize linalg parameters for this np_slk value 169 ! For the first np_slk value, everything is initialized 170 ! For the following np_slk values, only Scalapack parameters are updated 171 call abi_linalg_init(kgb_comm,np_slk,bigorder,my_rank,only_scalapack=(islk>islk1)) 172 173 ! We could do mband/blocksize iter as in lobpcg but it's too long 174 do iter=1,max_number_of_iter 175 176 ! Build matrixes 177 do ii=1,vectsize 178 do jj=1,blocksize 179 if (ii>jj) then 180 blockvectorx(ii,jj) =czero 181 blockvectorbx(ii,jj)=czero 182 else 183 blockvectorx(ii,jj) =cone 184 blockvectorbx(ii,jj)=cone 185 end if 186 end do 187 end do 188 grama=zero;gramb=zero 189 do jj=1,bigorder 190 do ii=jj,bigorder 191 if (ii==jj) then 192 grama(2*ii-1,jj)=one 193 gramb(2*ii-1,jj)=one 194 else 195 grama(2*ii-1:2*ii,jj)=one 196 grama(2*jj-1,ii)= one 197 grama(2*jj ,ii)=-one 198 end if 199 end do 200 end do 201 202 ! Call to abi_xorthonormalize 203 time_xortho=time_xortho-abi_wtime() 204 call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,kgb_comm,sqgram,vectsize) 205 time_xortho = time_xortho + abi_wtime() 206 207 ! Call to abi_xhegv 208 time_xeigen=time_xeigen-abi_wtime() 209 call abi_xhegv(1,'v','u',bigorder,grama,gramb,eigen,& 210 & x_cplx=2,use_slk=use_slk,use_gpu=use_lapack_gpu) 211 time_xeigen=time_xeigen+abi_wtime() 212 213 end do ! iter 214 215 ! Finalize linalg parameters for this np_slk value 216 ! For the last np_slk value, everything is finalized 217 ! For the previous np_slk values, only Scalapack parameters are updated 218 call abi_linalg_finalize(only_scalapack=(islk<np_slk_max)) 219 220 time_xortho= time_xortho*mband/blocksize 221 time_xeigen= time_xeigen*mband/blocksize 222 if (time_xortho<min_ortho) min_ortho=time_xortho 223 if (time_xeigen<min_eigen) then 224 min_eigen=time_xeigen 225 np_slk_best=np_slk 226 keep_gpu=use_lapack_gpu 227 end if 228 229 end do ! np_slk 230 231 #ifdef DEBUG_MODE 232 write(message,'(2(a,es15.3),a,i3)') ' In the best case, xortho took ',min_ortho,& 233 & ' and xeigen took ',min_eigen,' for np_slk=',np_slk_best 234 call wrtout(std_out,message,'PERS') 235 #endif 236 237 ! Final values to be sent to others process 238 acc_kgb=min_ortho+four*min_eigen 239 npslk=max(np_slk_best,1) 240 uselinalggpu=keep_gpu 241 242 ABI_DEALLOCATE(blockvectorx) 243 ABI_DEALLOCATE(blockvectorbx) 244 ABI_DEALLOCATE(sqgram) 245 ABI_DEALLOCATE(grama) 246 ABI_DEALLOCATE(gramb) 247 ABI_DEALLOCATE(eigen) 248 ABI_DEALLOCATE(val_npslk) 249 250 end if ! my_rank in group 251 252 !Free local MPI communicator 253 call xmpi_comm_free(kgb_comm) 254 255 !Broadcast of results to be sure every process has them 256 call xmpi_bcast(acc_kgb,0,glb_comm,ierr) 257 call xmpi_bcast(npslk,0,glb_comm,ierr) 258 call xmpi_bcast(uselinalggpu,0,glb_comm,ierr) 259 260 #ifndef DEBUG_MODE 261 ABI_UNUSED(message) 262 #endif 263 264 DBG_EXIT("COLL") 265 266 end subroutine compute_kgb_indicator