TABLE OF CONTENTS
ABINIT/finddistrproc [ Functions ]
NAME
finddistrproc
FUNCTION
Given a total number of processors, find a suitable distribution that fill all the different levels of parallelization (npimage, nppert, npkpt, npspinor, npband, npfft, bandpp) Also determine parameters of parallel Linear Algebra routines (use_slk, np_slk, gpu_linalg_limit)
COPYRIGHT
Copyright (C) 1999-2018 ABINIT group (FJ,MT,FD) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
INPUTS
dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables, for all datasets; at this stage only datasets with index lower than idtset are already initalized filnam(5)=character strings giving file names idtset=number of the current dataset mpi_enreg=information about MPI parallelization mband=maximum number of bands. ndtset_alloc=number of datasets, corrected for allocation of at least one data set tread(11)=flags indicating wether parallel input parameters were read from input file tread(1) : paral_kgb tread(6) : npfft tread(2) : npimage tread(7) : npband tread(3) : nppert tread(8) : bandpp tread(4) : npkpt tread(9) : use_slk tread(5) : nspinor tread(10): np_slk tread(11) : gpu_linalg_limit
SIDE EFFECTS
iexit= if incremented, an exit is required dtset%paral_kgb= flag for band-fft parallelism dtset%npimage = number of processors for parallelisation over image dtset%nppert = number of processors for parallelisation over perturbations dtset%npspinor = number of processors for parallelisation on k points dtset%npkpt = number of processors for parallelisation on k points dtset%npfft = number of processors for parallelisation on fft grid dtset%npband = number of processors for parallelisation on bands dtset%nphf = number of processors for parallelisation on occupied states for fock exchange dtset%bandpp = internal parameter for lobpcg parallelisation algorithm dtset%use_slk = flag for ScalaPAck use dtset%np_slk = number of processors used in ScaLapack routines dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU
PARENTS
mpi_setup
CHILDREN
compute_kgb_indicator,get_npert_rbz,hdr_free,hdr_read_from_fname initmpi_world,kpgcount,metric,mkfilename,mkrdim,sort_dp,wrtout xmpi_bcast
SOURCE
62 #if defined HAVE_CONFIG_H 63 #include "config.h" 64 #endif 65 66 #include "abi_common.h" 67 68 subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread) 69 70 use defs_basis 71 use defs_abitypes 72 use m_profiling_abi 73 use m_errors 74 use m_xmpi 75 use m_xomp 76 use m_hdr 77 use m_sort 78 79 use m_geometry, only : mkrdim, metric 80 use m_fftcore, only : kpgcount 81 use m_dtset, only : get_npert_rbz 82 83 !This section has been created automatically by the script Abilint (TD). 84 !Do not modify the following lines by hand. 85 #undef ABI_FUNC 86 #define ABI_FUNC 'finddistrproc' 87 use interfaces_14_hidewrite 88 use interfaces_51_manage_mpi 89 use interfaces_54_abiutil 90 use interfaces_57_iovars, except_this_one => finddistrproc 91 !End of the abilint section 92 93 implicit none 94 95 !Arguments ------------------------------------ 96 !scalars 97 integer,intent(in) :: idtset,mband,ndtset_alloc 98 integer,intent(inout) :: iexit 99 type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc) 100 type(MPI_type),intent(inout) :: mpi_enreg 101 !arrays 102 integer,intent(in) :: tread(11) 103 character(len=fnlen),intent(in) :: filnam(5) 104 105 !Local variables------------------------------- 106 !scalars 107 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20) 108 integer,parameter :: NPFMAX=128 109 integer,parameter :: MAXCOUNT=250,MAXBENCH=25,NPF_CUTOFF=20 110 integer :: bpp,bpp_max,bpp_min,optdriver,autoparal 111 integer :: npi_max,npi_min,npc,npc_max,npc_min 112 integer :: npk,npk_max,npk_min,npp_max,npp_min 113 integer :: nps,nps_max,nps_min,npf,npf_max,npf_min 114 integer :: npb,npb_max,npb_min,max_ncpus,ount 115 integer :: work_size,nks_per_proc,tot_ncpus 116 integer :: icount,ii,imin,jj,mcount,mcount_eff,mpw 117 integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff 118 integer :: nproc,nproc1,nprocmin,np_slk,use_linalg_gpu,omp_ncpus 119 logical,parameter :: new_version=.true. 120 logical :: dtset_found,file_found,first_bpp,iam_master 121 real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,ucvol,weight0 122 real(dp):: eff 123 character(len=9) :: suffix 124 character(len=500) :: message 125 character(len=fnlen) :: filden 126 type(hdr_type) :: hdr0 127 !arrays 128 integer :: idum(1),idum3(3),ngmax(3),ngmin(3) 129 integer,allocatable :: isort(:),jdtset_(:),my_distp(:,:) 130 integer,pointer :: nkpt_rbz(:) 131 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3) 132 real(dp),allocatable :: weight(:) 133 real(dp),pointer :: nband_rbz(:,:) 134 type(dataset_type),pointer :: dtset 135 !Cut-off function for npfft 136 ! cutoff(nn)= & 137 !& 0.2_dp+(one-0.2_dp)*(sin((pi*(nn-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) & 138 !& /((pi*(nn-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2 139 140 !****************************************************************** 141 142 DBG_ENTER("COLL") 143 144 !Select current dataset 145 dtset => dtsets(idtset) 146 147 !Is automatic parallelization activated? 148 autoparal = dtset%autoparal 149 if (autoparal==0) return 150 151 152 ! Handy local variables 153 iam_master = (mpi_enreg%me==0) 154 optdriver = dtset%optdriver 155 max_ncpus = dtset%max_ncpus 156 157 if (max_ncpus > 0 .and. autoparal/=0) then 158 iexit = iexit + 1 ! will stop in the parent. 159 end if 160 161 ! Unit number used for outputting the autoparal sections 162 ount = std_out 163 ount = ab_out 164 165 ! Small hack: set paral_kgb to -max_ncpus so that I don't have to change the previous implementation. 166 !if (dtset%paral_kgb == 1 .and. max_ncpus > 0) then 167 ! dtset%paral_kgb = -max_ncpus 168 !end if 169 170 if (optdriver==RUNL_GSTATE .and. dtset%paral_kgb==0 .and. & 171 & max_ncpus>0 .and. autoparal/=0) then 172 if (iam_master) then 173 ! This corresponds to the simplest algorithm for GS (band-by-band CG) 174 ! with distribution of k-points and spin. 175 work_size = dtset%nkpt * dtset%nsppol 176 write(ount,"(2a)")ch10,"--- !Autoparal" 177 write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)" 178 write(ount,"(a)") "info:" 179 write(ount,"(a,i0)")" autoparal: ",autoparal 180 write(ount,"(a,i0)")" paral_kgb: ",dtset%paral_kgb 181 write(ount,"(a,i0)")" max_ncpus: ",max_ncpus 182 write(ount,"(a,i0)")" nspinor: ",dtset%nspinor 183 write(ount,"(a,i0)")" nsppol: ",dtset%nsppol 184 write(ount,"(a,i0)")" nkpt: ",dtset%nkpt 185 write(ount,"(a,i0)")" mband: ",mband 186 187 ! List of configurations. 188 ! Assuming an OpenMP implementation with perfect speedup! 189 write(ount,"(a)")"configurations:" 190 191 do ii=1,max_ncpus 192 if (ii > work_size) cycle 193 do omp_ncpus=1,xomp_get_max_threads() 194 nks_per_proc = work_size / ii 195 nks_per_proc = nks_per_proc + MOD(work_size, ii) 196 eff = (one * work_size) / (ii * nks_per_proc) 197 198 write(ount,"(a,i0)")" - tot_ncpus: ",ii * omp_ncpus 199 write(ount,"(a,i0)")" mpi_ncpus: ",ii 200 write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus 201 write(ount,"(a,f12.9)")" efficiency: ",eff 202 !write(ount,"(a,f12.2)")" mem_per_cpu: ",mempercpu_mb 203 end do 204 end do 205 write(ount,'(a)')"..." 206 end if 207 ! Return immediately, will stop in the parent. 208 iexit = iexit + 1 209 RETURN 210 end if 211 212 213 nproc=mpi_enreg%nproc 214 !if (xmpi_paral==1.and.dtset%paral_kgb <0) nproc=-dtset%paral_kgb 215 if (max_ncpus > 0) nproc = dtset%max_ncpus 216 !if (xmpi_paral==1.and.dtset%paral_kgb <0) nproc=dtset%max_ncpus 217 if (xmpi_paral==0.and.dtset%paral_kgb>=0) nproc=1 218 219 if (dtset%paral_kgb>=0) then 220 if (nproc==1) then 221 if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0 222 if (tread(2)==0.or.xmpi_paral==0) dtset%npimage = 1 223 if (tread(3)==0.or.xmpi_paral==0) dtset%nppert = 1 224 if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1 225 if (tread(5)==0.or.xmpi_paral==0) dtset%npkpt = 1 226 if (tread(6)==0.or.xmpi_paral==0) dtset%npfft = 1 227 if (tread(7)==0.or.xmpi_paral==0) dtset%npband = 1 228 if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp = 1 229 if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk = 0 230 if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk = 1000000 231 return 232 end if 233 if ((dtset%optdriver/=RUNL_GSTATE.and.dtset%optdriver/=RUNL_RESPFN.and.dtset%optdriver/=RUNL_GWLS).or. & 234 & (dtset%optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then 235 dtset%paral_kgb= 0 236 dtset%npimage = max(1,dtset%npimage) 237 dtset%nppert = max(1,dtset%nppert) 238 dtset%npspinor = max(1,dtset%npspinor) 239 dtset%npkpt = max(1,dtset%npkpt) 240 dtset%npfft = max(1,dtset%npfft) 241 dtset%npband = max(1,dtset%npband) 242 dtset%bandpp = max(1,dtset%bandpp) 243 return 244 end if 245 end if 246 247 nprocmin=2 248 if (xmpi_paral==1.and.dtset%paral_kgb>=0) nprocmin=max(2,nproc-100) 249 if (max_ncpus > 0 .and. autoparal/=0) nprocmin = 1 250 251 !Need the metric tensor 252 call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd) 253 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 254 255 !Determine some quantities related to plane waves 256 ! - Crude estimation of the number of PW 257 ! - Number of G vectors in each direction 258 mpw=0;ngmin=0;ngmax=0 259 if (optdriver==RUNL_GSTATE) then 260 ecut_eff = dtset%ecut*dtset%dilatmx**2 261 mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation 262 if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1 263 call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt) 264 write(message,'(a,i8)') ' getmpw sequential formula gave: ',mpw 265 call wrtout(std_out,message,'COLL') 266 end if 267 268 write(message,'(2a,i0)') ch10,& 269 & ' Computing all possible proc distrib for this input with nproc less than ',nproc 270 call wrtout(std_out,message,'COLL') 271 272 !Parallelization over images 273 npi_min=1;npi_max=1;nimage_eff=1 274 if (optdriver==RUNL_GSTATE) then 275 nimage_eff=dtset%ndynimage 276 if (dtset%ntimimage<=1) nimage_eff=dtset%nimage 277 npi_min=max(1,dtset%npimage) 278 npi_max=min(nproc,nimage_eff) 279 if (tread(2)==1) npi_max=dtset%npimage 280 end if 281 282 !Parallelization over k-points and spin components (GS) 283 npk_min=1;npk_max=1;nkpt_eff=0 284 if (optdriver==RUNL_GSTATE) then 285 nkpt_eff=dtset%nkpt*dtset%nsppol 286 npk_min=max(1,dtset%npkpt) 287 npk_max=min(nproc,nkpt_eff) 288 if (tread(4)==1) npk_max=dtset%npkpt 289 end if 290 291 !Parallelization over perturbations, k-points and spin components (DFPT) 292 npp_min=1;npp_max=1;npert_eff=1 293 if (optdriver==RUNL_RESPFN) then 294 if (dtset%paral_rf==1) then 295 call get_npert_rbz(dtset,nband_rbz,nkpt_rbz,npert_eff) 296 do jj=1,npert_eff 297 ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj)) 298 nkpt_eff=max(nkpt_eff,ii) 299 end do 300 npp_min=max(1,dtset%nppert) 301 npp_max=min(nproc,npert_eff) 302 if (tread(3)==1) then 303 npp_max=dtset%nppert 304 if (npp_max>npert_eff) then 305 npp_min=npert_eff;npp_max=npert_eff 306 message='nppert is bigger than npert; we set nppert=npert' 307 MSG_WARNING(message) 308 end if 309 end if 310 npk_min=1 311 npk_max=min(nproc,nkpt_eff) 312 ABI_DEALLOCATE(nkpt_rbz) 313 ABI_DEALLOCATE(nband_rbz) 314 else 315 nkpt_eff=nproc 316 npk_min=nproc-5 317 npk_max=nproc 318 end if 319 end if 320 321 !Parallelization over spinorial components 322 nps_min=1;nps_max=1 323 if (optdriver==RUNL_GSTATE) then 324 nps_min=max(1,dtset%npspinor) 325 nps_max=min(nproc,dtset%nspinor) 326 if (tread(5)==1) nps_max=dtset%npspinor 327 end if 328 329 !KGB Parallelization 330 331 !>> FFT level 332 npf_min=1;npf_max=1 333 npb_min=1;npb_max=1 334 bpp_min=1;bpp_max=1 335 n2=0;n3=0 336 if (dtset%optdriver==RUNL_GSTATE) then 337 npf_min=max(1,dtset%npfft) 338 npf_min=min(npf_min,ngmin(2)) 339 npf_max=min(nproc,NPFMAX) 340 if (tread(6)==1) then 341 npf_max=dtset%npfft 342 if (npf_max>ngmin(2)) then 343 write(message,'(3a)') & 344 & "Value of npfft given in input file is too high for the FFT grid!",ch10,& 345 & "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)." 346 MSG_ERROR(message) 347 end if 348 end if 349 npf_max=min(npf_max,ngmin(2)) 350 if (dtset%use_gpu_cuda==1) then 351 npf_min=1;npf_max=1 352 end if 353 354 ! Number of FFT procs has to be a multiple of FFT grid sizes 355 ! In case of a restart from a density file, it has to be 356 ! compatible with the FFT grid used for the density 357 n2=dtset%ngfft(2) ; n3=dtset%ngfft(3) 358 if (n2==0.and.n3==0.and.(dtset%getden/=0.or.dtset%irdden/=0.or.dtset%iscf<0)) then 359 dtset_found=.false.;file_found=.false. 360 !1-Try to find ngfft from previous dataset 361 if (dtset%getden/=0) then 362 do ii=1,ndtset_alloc 363 jj=dtset%getden;if (jj<0) jj=dtset%jdtset+jj 364 if (dtsets(ii)%jdtset==jj) then 365 dtset_found=.true. 366 ! n2=dtsets(ii)%nfftdg;n3=0 367 n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3) 368 end if 369 end do 370 end if 371 !2-If not found, try to extract ngfft from density file 372 if (.not.dtset_found) then 373 !Retrieve file name 374 suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN' 375 ABI_ALLOCATE(jdtset_,(0:ndtset_alloc)) 376 jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset 377 call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii) 378 ABI_DEALLOCATE(jdtset_) 379 !Retrieve ngfft from file header 380 idum3=0 381 if (mpi_enreg%me==0) then 382 inquire(file=trim(filden),exist=file_found) 383 if (file_found) then 384 call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self) 385 idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1 386 call hdr_free(hdr0) 387 MSG_WARNING("Cannot find filden"//filden) 388 end if 389 end if 390 call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii) 391 n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0) 392 end if 393 end if 394 395 ! >> Band level 396 npb_min=max(1,dtset%npband) 397 npb_max=min(nproc,mband) 398 if (tread(7)==1) npb_max=dtset%npband 399 400 ! >> banddp level 401 bpp_min=max(1,dtset%bandpp) 402 bpp_max=max(4,nint(mband/10.)) ! reasonnable bandpp max 403 if (tread(8)==1) bpp_max=dtset%bandpp 404 end if 405 406 !Disable KGB parallelisation in some cases: 407 ! - no GS 408 ! - paral_kgb=0 present in input file 409 ! - nstep=0 410 ! - Self-consistent DMFT 411 ! - Hartree-Fock or hybrid calculation (for now on) 412 if ( (optdriver/=RUNL_GSTATE) .or. (dtset%paral_kgb==0.and.tread(1)==1) .or. & 413 & (dtset%nstep==0).or. (dtset%usedmft==1.and.dtset%nstep>1) .or. & 414 & (dtset%usefock==1) ) then 415 nps_min=1; nps_max=1 416 npf_min=1; npf_max=1 417 npb_min=1; npb_max=1 418 bpp_min=1; bpp_max=1 419 end if 420 421 !Print title 422 if (iam_master) then 423 if (optdriver==RUNL_GSTATE) then 424 write(message, '(8(a12,a1),a,8(i4,a4,i4,a1))' ) & 425 'npimage','|','npkpt','|','npspinor','|','npfft','|','npband','|',' bandpp ' ,'|','nproc','|','weight','|', ch10, & 426 npi_min,' -> ',npi_max,'|',npk_min,' -> ',npk_max,'|',nps_min,' -> ',nps_max,'|', & 427 npf_min,' -> ',npf_max,'|',npb_min,' -> ',npb_max,'|',bpp_min,' -> ',bpp_max,'|', & 428 nprocmin,' -> ',nproc,'|', 1 ,' -> ',nproc,'|' 429 end if 430 if (optdriver==RUNL_RESPFN) then 431 write(message, '(4(a12,a1),a,4(i4,a4,i4,a1))' ) & 432 'nppert','|','npkpt','|','nproc','|','weight','|', ch10, & 433 npp_min,' -> ',npp_max,'|', npk_min,' -> ',npk_max,'|', & 434 nprocmin,' -> ',nproc,'|', 1 ,' -> ',nproc,'|' 435 end if 436 call wrtout(std_out,message,'COLL') 437 if(max_ncpus>0) then 438 call wrtout(ab_out,message,'COLL') 439 end if 440 end if 441 442 !Allocate lists 443 ABI_ALLOCATE(my_distp,(10,MAXCOUNT)) 444 ABI_ALLOCATE(weight,(MAXCOUNT)) 445 my_distp(1:7,:)=0;weight(:)=zero 446 my_distp(8,:)=dtset%use_slk 447 my_distp(9,:)=dtset%np_slk 448 my_distp(10,:)=dtset%gpu_linalg_limit 449 icount=0;imin=1 450 451 npc_min=1;npc_max=1;ncell_eff=1 452 if (optdriver==RUNL_GSTATE) then 453 ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max 454 end if 455 if (optdriver==RUNL_RESPFN) then 456 ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max 457 end if 458 459 !Loop over all possibilities 460 !Computation of weight~"estimated acceleration" 461 if (new_version) then 462 463 ! ======= NEW VERSION ======== 464 do npc=npc_min,npc_max 465 acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc) 466 467 do npk=npk_min,npk_max 468 ! -> for DFPT runs, impose that nsppol divide npk 469 if (optdriver==RUNL_RESPFN.and.modulo(npk,dtset%nsppol)>0.and.npk>1) cycle 470 acc_k=one;if (npk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,npk) 471 472 do nps=nps_min,nps_max 473 acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps) 474 475 do npf=npf_min,npf_max 476 ! -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok) 477 if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle 478 ! -> npf should be only divisible by 2, 3 or 5 479 ii=npf 480 do while (modulo(ii,2)==0) 481 ii=ii/2 482 end do 483 do while (modulo(ii,3)==0) 484 ii=ii/3 485 end do 486 do while (modulo(ii,5)==0) 487 ii=ii/5 488 end do 489 if(ii/=1) cycle 490 491 do npb=npb_min,npb_max 492 nproc1=npc*npk*nps*npf*npb 493 if (nproc1<nprocmin) cycle 494 if (nproc1>nproc) cycle 495 if (modulo(mband,npb)>0) cycle 496 497 ! Base speedup 498 acc_kgb_0=one;if (npb*npf>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf)) 499 500 if (npb*npf>4) then 501 ! Promote npb=npf 502 acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf)) 503 ! Promote npf<=20 504 if (npf>20)then 505 acc_kgb_0=acc_kgb_0* & 506 & 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) & 507 & /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2 508 end if 509 end if 510 511 first_bpp=.true. 512 do bpp=bpp_min,bpp_max 513 if (modulo(mband/npb,bpp)>0) cycle 514 if ((bpp>1).and.(modulo(bpp,2)>0)) cycle 515 if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle 516 if (npb*npf<=4.and.(.not.first_bpp)) cycle 517 first_bpp=.false. 518 519 acc_kgb=acc_kgb_0 520 ! Promote bpp*npb>mband/3 521 if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband)) 522 523 ! Resulting speedup 524 ! weight0=acc_c*acc_k*acc_s*acc_kgb 525 weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+npk+nps+(npf*npb)) 526 527 ! Store data 528 icount=icount+1 529 if (icount<=MAXCOUNT) then 530 my_distp(1:7,icount)=(/npc,npk,nps,npf,npb,bpp,nproc1/) 531 weight(icount)=weight0 532 if (weight0<weight(imin)) imin=icount 533 else 534 if (weight0>weight(imin)) then 535 my_distp(1:7,imin)=(/npc,npk,nps,npf,npb,bpp,nproc1/) 536 weight(imin)=weight0 537 idum=minloc(weight);imin=idum(1) 538 end if 539 end if 540 541 end do ! bpp 542 end do ! npb 543 end do ! npf 544 end do ! nps 545 end do ! npk 546 end do ! npc 547 else 548 549 ! ======= OLD VERSION ======== 550 do npc=npc_min,npc_max 551 acc_c=one;if (npc>1) acc_c = 0.99_dp*ncell_eff/((ncell_eff+npc-1)/npc) 552 553 do npk=npk_min,npk_max 554 acc_k=one;if (npk>1) acc_k = 0.96_dp*nkpt_eff/((nkpt_eff+npk-1)/npk) 555 556 do nps=nps_min,nps_max 557 acc_s=one;if (nps>1) acc_s = 0.85_dp*dtset%nspinor/ ((dtset%nspinor+nps-1)/nps) 558 559 do npf=npf_min,npf_max 560 ! -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok) 561 if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle 562 ! -> npf should be only divisible by 2, 3, 5, 7 or 11 563 npb=npf ! Note that here, npb is used as a temp var 564 do while (modulo(npb,2)==0) 565 npb=npb/2 566 end do 567 do while (modulo(npb,3)==0) 568 npb=npb/3 569 end do 570 do while (modulo(npb,5)==0) 571 npb=npb/5 572 end do 573 do while (modulo(npb,7)==0) 574 npb=npb/7 575 end do 576 do while (modulo(npb,11)==0) 577 npb=npb/11 578 end do 579 if(npb/=1) cycle 580 581 do npb=npb_min,npb_max 582 nproc1=npc*npk*nps*npf*npb 583 if (nproc1<nprocmin) cycle 584 if (nproc1>nproc) cycle 585 if(modulo(mband,npb)>0) cycle 586 587 do bpp=bpp_max,bpp_min,-1 588 if(modulo(mband/npb,bpp)>0) cycle 589 if((bpp>1).and.(modulo(bpp,2)>0)) cycle 590 if (1.*npb*bpp >max(1.,mband/3.)) cycle 591 592 acc_kgb=one 593 if (npb*npf>4) then 594 acc_kgb=min((one*npf)/(one*npb),(one*npb)/(one*npf)) * & 595 (mpw/(mpw/(npb*npf)))*(one-(three*bpp*npb)/mband) 596 else if (npb*npf >1) then 597 acc_kgb=(mpw*mband/(mband*mpw/(npb*npf)))*0.7_dp 598 end if 599 600 ! Weight average for efficiency and estimated acceleration 601 weight0=(acc_c+acc_k+acc_s+acc_kgb)/(npc+npk+nps+(npf*npb)) 602 weight0=weight0*nproc1 603 604 ! Store data 605 icount=icount+1 606 if (icount<=MAXCOUNT) then 607 my_distp(1:7,icount)=(/npc,npk,nps,npf,npb,bpp,nproc1/) 608 weight(icount)=weight0 609 if (weight0<weight(imin)) imin=icount 610 else 611 if (weight0>weight(imin)) then 612 my_distp(1:7,imin)=(/npc,npk,nps,npf,npb,bpp,nproc1/) 613 weight(imin)=weight0 614 idum=minloc(weight);imin=idum(1) 615 end if 616 end if 617 618 end do ! bpp 619 end do ! npb 620 end do ! npf 621 end do ! nps 622 end do ! npk 623 end do ! npc 624 625 ! New or old version 626 end if 627 628 mcount_eff=icount 629 mcount=min(mcount_eff,MAXCOUNT) 630 631 if (mcount==0) then 632 write(message,'(a,i0,2a,i0,a)') & 633 'Your input dataset does not let Abinit find an appropriate process distribution with nproc=',nproc,ch10, & 634 'Try to comment all the np* vars and set paral_kgb=',-nproc,' to have advices on process distribution.' 635 MSG_WARNING(message) 636 ! Override here the 0 default value changed in indefo1 637 dtset%npimage = max(1,dtset%npimage) 638 dtset%nppert = max(1,dtset%nppert) 639 dtset%npkpt = max(1,dtset%npkpt) 640 dtset%npspinor = max(1,dtset%npspinor) 641 dtset%npfft = max(1,dtset%npfft) 642 dtset%npband = max(1,dtset%npband) 643 dtset%bandpp = max(1,dtset%bandpp) 644 ABI_DEALLOCATE(my_distp) 645 ABI_DEALLOCATE(weight) 646 return 647 end if 648 649 !* HF or hybrid calculation: no use of the fonction "autoparal" 650 if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then 651 write(message,'(a,i5,2a,i6,a)') & 652 'Hartree-Fock or hybrid calculation : Your input dataset does not let Abinit find an appropriate process distribution.' 653 MSG_WARNING(message) 654 ! Override here the 0 default value changed in indefo1 655 dtset%npimage = max(1,dtset%npimage) 656 dtset%npkpt = max(1,dtset%npkpt) 657 dtset%npspinor = max(1,dtset%npspinor) 658 dtset%npfft = max(1,dtset%npfft) 659 dtset%npband = max(1,dtset%npband) 660 dtset%bandpp = max(1,dtset%bandpp) 661 ABI_DEALLOCATE(my_distp) 662 ABI_DEALLOCATE(weight) 663 return 664 end if 665 666 !Sort data by increasing weight 667 ABI_ALLOCATE(isort,(mcount)) 668 isort=(/(ii,ii=1,mcount)/) 669 call sort_dp(mcount,weight,isort,tol6) 670 671 ncount=mcount;if (dtset%paral_kgb>=0) ncount=min(mcount,5) 672 if (iam_master) then 673 do jj=mcount,mcount-ncount+1,-1 674 ii=isort(jj) 675 if (optdriver==RUNL_GSTATE) then 676 write(message, '(7(i12,a1),f11.2,a2)') & 677 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|', & 678 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),' |' 679 end if 680 if (optdriver==RUNL_RESPFN) then 681 write(message, '(3(i12,a1),f11.2,a2)') & 682 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(7,ii),'|',weight(jj),' |' 683 end if 684 call wrtout(std_out,message,'COLL') 685 if(max_ncpus>0) then 686 call wrtout(ab_out,message,'COLL') 687 end if 688 end do 689 end if 690 691 if (max_ncpus>0.and.(mcount_eff>MAXCOUNT)) then 692 write(message,'(a,i0,a,i0,a)') & 693 & ' Received max_ncpus ',max_ncpus,' possible choices for nproc; only the first ',MAXCOUNT,' ones are printed...' 694 call wrtout(ab_out,message,'COLL') 695 call wrtout(std_out,message,'COLL') 696 end if 697 698 !if (iam_master .and. dtset%paral_kgb<0) then 699 if (iam_master .and. max_ncpus>0) then 700 write(ount,'(2a)')ch10,"--- !Autoparal" 701 702 if (optdriver==RUNL_GSTATE) then 703 write(ount,"(a)")"#Autoparal section for GS calculations with paral_kgb" 704 else if (optdriver==RUNL_RESPFN) then 705 write(ount,"(a)")'#Autoparal section for DFPT calculations' 706 else 707 MSG_ERROR("Unsupported optdriver") 708 end if 709 710 write(ount,"(a)") "info:" 711 write(ount,"(a,i0)")" autoparal: ",autoparal 712 write(ount,"(a,i0)")" paral_kgb: ",dtset%paral_kgb 713 write(ount,"(a,i0)")" max_ncpus: ",max_ncpus 714 write(ount,"(a,i0)")" nspinor: ",dtset%nspinor 715 write(ount,"(a,i0)")" nsppol: ",dtset%nsppol 716 write(ount,"(a,i0)")" nkpt: ",dtset%nkpt 717 write(ount,"(a,i0)")" mband: ",mband 718 719 ! List of configurations. 720 write(ount,"(a)")"configurations:" 721 722 if (optdriver==RUNL_GSTATE) then 723 724 do jj=mcount,mcount-ncount+1,-1 725 ii=isort(jj) 726 tot_ncpus = my_distp(7,ii) 727 eff = weight(jj) / tot_ncpus 728 729 write(ount,"(a,i0)")" - tot_ncpus: ",tot_ncpus 730 write(ount,"(a,i0)")" mpi_ncpus: ",tot_ncpus 731 !write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus !OMP not supported (yet) 732 write(ount,"(a,f12.9)")" efficiency: ",eff 733 !write(ount,"(a,f12.2)")" mem_per_cpu: ",mempercpu_mb 734 735 ! list of variables to use. 736 !'npimage','|','npkpt','|','npspinor','|','npfft','|','npband','|',' bandpp ' ,'|','nproc','|','weight','|' 737 write(ount,"(a)" )" vars: {" 738 write(ount,"(a,i0,a)")" npimage: ",my_distp(1,ii),"," 739 write(ount,"(a,i0,a)")" npkpt: ", my_distp(2,ii),"," 740 write(ount,"(a,i0,a)")" npspinor: ",my_distp(3,ii),"," 741 write(ount,"(a,i0,a)")" npfft: ", my_distp(4,ii),"," 742 write(ount,"(a,i0,a)")" npband: ",my_distp(5,ii),"," 743 write(ount,"(a,i0,a)")" bandpp: ",my_distp(6,ii),"," 744 write(ount,"(a)") " }" 745 end do 746 747 else if (optdriver==RUNL_RESPFN) then 748 749 do jj=mcount,mcount-ncount+1,-1 750 ii=isort(jj) 751 tot_ncpus = my_distp(7,ii) 752 eff = weight(jj) / tot_ncpus 753 754 write(ount,"(a,i0)")" - tot_ncpus: ",tot_ncpus 755 write(ount,"(a,i0)")" mpi_ncpus: ",tot_ncpus 756 !write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus !OMP not supported (yet) 757 write(ount,"(a,f12.9)")" efficiency: ",eff 758 !write(ount,"(a,f12.2)")" mem_per_cpu: ",mempercpu_mb 759 ! list of variables to use. 760 !'nppert','|','npkpt','|','nproc','|','weight','|', 761 write(ount,"(a)" )" vars: {" 762 write(ount,"(a,i0,a)")" nppert: ", my_distp(1,ii),"," 763 write(ount,"(a,i0,a)")" npkpt: ", my_distp(2,ii),"," 764 write(ount,"(a)") " }" 765 end do 766 767 end if 768 write(ount,'(a)')"..." 769 iexit = iexit + 1 770 end if 771 772 icount=isort(mcount) 773 774 !Refinement of the process distribution by mean of a LinAlg routines benchmarking 775 if (optdriver==RUNL_GSTATE.and.autoparal/=1) then 776 if (autoparal/=3) then 777 if (autoparal==2) then 778 write(message,'(5a,9(a10,a1))') ch10, & 779 & ' Values below have been tested with respect to Linear Algebra performance;',ch10,& 780 & ' Weights below are corrected according:',ch10,& 781 & 'npimage','|','npkpt' ,'|','npspinor' ,'|','npfft' ,'|','npband','|',' bandpp ' ,'|',& 782 & 'nproc' ,'|','weight','|','new weight','|' 783 else 784 write(message,'(5a,11(a10,a1))') ch10, & 785 & ' Values below have been tested with respect to Linear Algebra performance;',ch10,& 786 & ' Weights below are corrected according:',ch10,& 787 & 'npimage','|','npkpt' ,'|','npspinor' ,'|','npfft' ,'|','npband','|',' bandpp ' ,'|',& 788 & 'nproc' ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|' 789 end if 790 call wrtout(std_out,message,'COLL') 791 if (max_ncpus > 0) then 792 call wrtout(ab_out,message,'COLL') 793 end if 794 end if 795 acc_k=zero 796 ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1 797 do jj=mcount,mcount-ncount+1,-1 798 ii=isort(jj) 799 npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii) 800 if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then 801 use_linalg_gpu=dtset%use_gpu_cuda 802 call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu) 803 if (autoparal/=2) then 804 my_distp(9,ii)=np_slk 805 if (np_slk>0) my_distp(8,ii)=1 806 ! * gpu_linalg_limit: 807 ! No use of GPU: huge value ~2 *vectsize*blocksize**2 tested 808 ! Use of GPU: tiny value ~0.5*vectsize*blocksize**2 tested 809 my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf 810 if (use_linalg_gpu==1) my_distp(10,ii)=my_distp(10,ii)/4 811 end if 812 if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed 813 ! * Weight (corrected by 10% of the computed ratio) 814 weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb) 815 if (autoparal==2) then 816 write(message, '(7(i10,a1),f9.2,a2,f9.5,a2)') & 817 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',& 818 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |' 819 else if (autoparal==3) then 820 write(message,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, & 821 & ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),& 822 & ' and use_linalg_gpu=',use_linalg_gpu 823 else 824 write(message, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') & 825 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',& 826 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',& 827 & my_distp(9,ii),'|',use_linalg_gpu,'|' 828 end if 829 call wrtout(std_out,message,'COLL') 830 if (max_ncpus>0) then 831 call wrtout(ab_out,message,'COLL') 832 end if 833 ! We store the best value in weight(mcount) and keep icount 834 if (weight0 > weight(mcount)) then 835 icount=ii;weight(mcount)=weight0 836 end if 837 end if 838 end do 839 end if 840 841 !Final advice in case max_ncpus > 0 842 if (max_ncpus>0) then 843 write(message,'(6a)') ch10,& 844 & ' Launch a parallel version of ABINIT with a number of processors among the above list,',ch10,& 845 & ' and the associated input variables npkpt, npband, npfft and bandpp. ',ch10,& 846 & ' The optimal weight is close to nproc and the higher should be better.' 847 call wrtout(std_out,message,'COLL') 848 call wrtout(ab_out,message,'COLL') 849 iexit=iexit+1 850 GOTO 100 851 end if 852 853 !Store new process distribution 854 if (dtset%paral_kgb>=0) then 855 nproc1=my_distp(7,icount) 856 ! Work load distribution 857 if (optdriver==RUNL_GSTATE) then 858 dtset%npimage= my_distp(1,icount) 859 dtset%nppert = 1 860 dtset%npkpt = my_distp(2,icount) 861 end if 862 if (optdriver==RUNL_RESPFN) then 863 dtset%npimage= 1 864 dtset%nppert = my_distp(1,icount) 865 dtset%npkpt = 1 866 end if 867 dtset%npspinor = my_distp(3,icount) 868 dtset%npfft = my_distp(4,icount) 869 dtset%npband = my_distp(5,icount) 870 dtset%bandpp = my_distp(6,icount) 871 ! The following lines are mandatory : the DFT+DMFT must use ALL the 872 ! available procs specified by the user. So nproc1=nproc. 873 ! Works only if paral_kgb is not activated. 874 if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE.and.dtset%paral_kgb==0) then 875 dtset%npspinor = 1 876 dtset%npfft = 1 877 dtset%npband = 1 878 dtset%bandpp = 1 879 dtset%npimage = 1 880 nproc1 = nproc 881 end if 882 if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1 883 ! LinAlg parameters: we change values only if they are not present in input file 884 if (dtset%paral_kgb==1) then 885 if (tread(9)==0) dtset%use_slk=my_distp(8,icount) 886 if (tread(10)==0) dtset%np_slk=my_distp(9,icount) 887 if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount) 888 end if 889 ! New definition of "world" MPI communicator 890 if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then 891 nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90 892 nproc1=min(nproc1,nproc) 893 nproc1=(nproc1/dtset%nppert)*dtset%nppert 894 end if 895 call initmpi_world(mpi_enreg,nproc1) 896 ! call initmpi_world(mpi_enreg,nproc1) 897 end if 898 899 100 continue 900 901 ABI_DEALLOCATE(isort) 902 ABI_DEALLOCATE(my_distp) 903 ABI_DEALLOCATE(weight) 904 905 DBG_EXIT("COLL") 906 907 contains 908 909 function speedup_fdp(nn,mm) 910 !Expected linear speedup for a nn-sized problem and mm processes 911 912 !This section has been created automatically by the script Abilint (TD). 913 !Do not modify the following lines by hand. 914 #undef ABI_FUNC 915 #define ABI_FUNC 'speedup_fdp' 916 !End of the abilint section 917 918 real(dp) :: speedup_fdp 919 integer,intent(in) :: nn,mm 920 speedup_fdp=(one*nn)/(one*((nn/mm)+merge(0,1,mod(nn,mm)==0))) 921 end function speedup_fdp 922 923 end subroutine finddistrproc