TABLE OF CONTENTS


ABINIT/finddistrproc [ Functions ]

[ Top ] [ Functions ]

NAME

 finddistrproc

FUNCTION

   Given a total number of processors, find a suitable distribution
   that fill all the different levels of parallelization
   (npimage, nppert, npkpt, npspinor, npband, npfft, bandpp)
   Also determine parameters of parallel Linear Algebra routines
   (use_slk, np_slk, gpu_linalg_limit)

COPYRIGHT

 Copyright (C) 1999-2018 ABINIT group (FJ,MT,FD)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .
 For the initials of contributors, see ~abinit/doc/developers/contributors.txt.

INPUTS

  dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables,
   for all datasets; at this stage only datasets with index lower than
   idtset are already initalized
  filnam(5)=character strings giving file names
  idtset=number of the current dataset
  mpi_enreg=information about MPI parallelization
  mband=maximum number of bands.
  ndtset_alloc=number of datasets, corrected for allocation of at least one data set
  tread(11)=flags indicating wether parallel input parameters were read from input file
            tread(1)  : paral_kgb      tread(6) : npfft
            tread(2)  : npimage        tread(7) : npband
            tread(3)  : nppert         tread(8) : bandpp
            tread(4)  : npkpt          tread(9) : use_slk
            tread(5)  : nspinor        tread(10): np_slk
            tread(11) : gpu_linalg_limit

SIDE EFFECTS

  iexit= if incremented, an exit is required
  dtset%paral_kgb= flag for band-fft parallelism
  dtset%npimage  = number of processors for parallelisation over image
  dtset%nppert   = number of processors for parallelisation over perturbations
  dtset%npspinor = number of processors for parallelisation on k points
  dtset%npkpt    = number of processors for parallelisation on k points
  dtset%npfft    = number of processors for parallelisation on fft grid
  dtset%npband   = number of processors for parallelisation on bands
  dtset%nphf     = number of processors for parallelisation on occupied states for fock exchange
  dtset%bandpp   = internal parameter for lobpcg parallelisation algorithm
  dtset%use_slk  = flag for ScalaPAck use
  dtset%np_slk   = number of processors used in ScaLapack routines
  dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU

PARENTS

      mpi_setup

CHILDREN

      compute_kgb_indicator,get_npert_rbz,hdr_free,hdr_read_from_fname
      initmpi_world,kpgcount,metric,mkfilename,mkrdim,sort_dp,wrtout
      xmpi_bcast

SOURCE

 62 #if defined HAVE_CONFIG_H
 63 #include "config.h"
 64 #endif
 65 
 66 #include "abi_common.h"
 67 
 68  subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread)
 69 
 70  use defs_basis
 71  use defs_abitypes
 72  use m_profiling_abi
 73  use m_errors
 74  use m_xmpi
 75  use m_xomp
 76  use m_hdr
 77  use m_sort
 78 
 79  use m_geometry,     only : mkrdim, metric
 80  use m_fftcore,      only : kpgcount
 81  use m_dtset,        only : get_npert_rbz
 82 
 83 !This section has been created automatically by the script Abilint (TD).
 84 !Do not modify the following lines by hand.
 85 #undef ABI_FUNC
 86 #define ABI_FUNC 'finddistrproc'
 87  use interfaces_14_hidewrite
 88  use interfaces_51_manage_mpi
 89  use interfaces_54_abiutil
 90  use interfaces_57_iovars, except_this_one => finddistrproc
 91 !End of the abilint section
 92 
 93  implicit none
 94 
 95 !Arguments ------------------------------------
 96 !scalars
 97  integer,intent(in) :: idtset,mband,ndtset_alloc
 98  integer,intent(inout) :: iexit
 99  type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc)
100  type(MPI_type),intent(inout) :: mpi_enreg
101 !arrays
102  integer,intent(in) :: tread(11)
103  character(len=fnlen),intent(in) :: filnam(5)
104 
105 !Local variables-------------------------------
106 !scalars
107 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20)
108  integer,parameter :: NPFMAX=128
109  integer,parameter :: MAXCOUNT=250,MAXBENCH=25,NPF_CUTOFF=20
110  integer :: bpp,bpp_max,bpp_min,optdriver,autoparal
111  integer :: npi_max,npi_min,npc,npc_max,npc_min
112  integer :: npk,npk_max,npk_min,npp_max,npp_min
113  integer :: nps,nps_max,nps_min,npf,npf_max,npf_min
114  integer :: npb,npb_max,npb_min,max_ncpus,ount
115  integer :: work_size,nks_per_proc,tot_ncpus
116  integer :: icount,ii,imin,jj,mcount,mcount_eff,mpw
117  integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff
118  integer :: nproc,nproc1,nprocmin,np_slk,use_linalg_gpu,omp_ncpus
119  logical,parameter :: new_version=.true.
120  logical :: dtset_found,file_found,first_bpp,iam_master
121  real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,ucvol,weight0
122  real(dp):: eff
123  character(len=9) :: suffix
124  character(len=500) :: message
125  character(len=fnlen) :: filden
126  type(hdr_type) :: hdr0
127 !arrays
128  integer :: idum(1),idum3(3),ngmax(3),ngmin(3)
129  integer,allocatable :: isort(:),jdtset_(:),my_distp(:,:)
130  integer,pointer :: nkpt_rbz(:)
131  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3)
132  real(dp),allocatable :: weight(:)
133  real(dp),pointer :: nband_rbz(:,:)
134  type(dataset_type),pointer :: dtset
135 !Cut-off function for npfft
136 ! cutoff(nn)= &
137 !&    0.2_dp+(one-0.2_dp)*(sin((pi*(nn-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) &
138 !&                           /((pi*(nn-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2
139 
140 !******************************************************************
141 
142  DBG_ENTER("COLL")
143 
144 !Select current dataset
145  dtset => dtsets(idtset)
146 
147 !Is automatic parallelization activated?
148  autoparal = dtset%autoparal
149  if (autoparal==0) return
150 
151 
152  ! Handy local variables
153  iam_master = (mpi_enreg%me==0)
154  optdriver = dtset%optdriver
155  max_ncpus = dtset%max_ncpus
156 
157  if (max_ncpus > 0 .and. autoparal/=0) then
158    iexit = iexit + 1 ! will stop in the parent.
159  end if
160 
161  ! Unit number used for outputting the autoparal sections
162  ount = std_out
163  ount = ab_out
164 
165  ! Small hack: set paral_kgb to -max_ncpus so that I don't have to change the previous implementation.
166  !if (dtset%paral_kgb == 1 .and. max_ncpus > 0) then
167  !  dtset%paral_kgb = -max_ncpus
168  !end if
169 
170  if (optdriver==RUNL_GSTATE .and. dtset%paral_kgb==0 .and. &
171 & max_ncpus>0 .and. autoparal/=0) then
172    if (iam_master) then
173      ! This corresponds to the simplest algorithm for GS (band-by-band CG)
174      ! with distribution of k-points and spin.
175      work_size = dtset%nkpt * dtset%nsppol
176      write(ount,"(2a)")ch10,"--- !Autoparal"
177      write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)"
178      write(ount,"(a)")   "info:"
179      write(ount,"(a,i0)")"    autoparal: ",autoparal
180      write(ount,"(a,i0)")"    paral_kgb: ",dtset%paral_kgb
181      write(ount,"(a,i0)")"    max_ncpus: ",max_ncpus
182      write(ount,"(a,i0)")"    nspinor: ",dtset%nspinor
183      write(ount,"(a,i0)")"    nsppol: ",dtset%nsppol
184      write(ount,"(a,i0)")"    nkpt: ",dtset%nkpt
185      write(ount,"(a,i0)")"    mband: ",mband
186 
187      ! List of configurations.
188      ! Assuming an OpenMP implementation with perfect speedup!
189      write(ount,"(a)")"configurations:"
190 
191      do ii=1,max_ncpus
192        if (ii > work_size) cycle
193        do omp_ncpus=1,xomp_get_max_threads()
194          nks_per_proc = work_size / ii
195          nks_per_proc = nks_per_proc + MOD(work_size, ii)
196          eff = (one * work_size) / (ii * nks_per_proc)
197 
198          write(ount,"(a,i0)")"    - tot_ncpus: ",ii * omp_ncpus
199          write(ount,"(a,i0)")"      mpi_ncpus: ",ii
200          write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus
201          write(ount,"(a,f12.9)")"      efficiency: ",eff
202          !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
203        end do
204      end do
205      write(ount,'(a)')"..."
206    end if
207    ! Return immediately, will stop in the parent.
208    iexit = iexit + 1
209    RETURN
210  end if
211 
212 
213  nproc=mpi_enreg%nproc
214  !if (xmpi_paral==1.and.dtset%paral_kgb <0) nproc=-dtset%paral_kgb
215  if (max_ncpus > 0) nproc = dtset%max_ncpus
216  !if (xmpi_paral==1.and.dtset%paral_kgb <0) nproc=dtset%max_ncpus
217  if (xmpi_paral==0.and.dtset%paral_kgb>=0) nproc=1
218 
219  if (dtset%paral_kgb>=0) then
220    if (nproc==1) then
221      if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0
222      if (tread(2)==0.or.xmpi_paral==0) dtset%npimage  = 1
223      if (tread(3)==0.or.xmpi_paral==0) dtset%nppert   = 1
224      if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1
225      if (tread(5)==0.or.xmpi_paral==0) dtset%npkpt    = 1
226      if (tread(6)==0.or.xmpi_paral==0) dtset%npfft    = 1
227      if (tread(7)==0.or.xmpi_paral==0) dtset%npband   = 1
228      if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp   = 1
229      if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk  = 0
230      if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk  = 1000000
231      return
232    end if
233    if ((dtset%optdriver/=RUNL_GSTATE.and.dtset%optdriver/=RUNL_RESPFN.and.dtset%optdriver/=RUNL_GWLS).or. &
234 &   (dtset%optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then
235      dtset%paral_kgb= 0
236      dtset%npimage  = max(1,dtset%npimage)
237      dtset%nppert   = max(1,dtset%nppert)
238      dtset%npspinor = max(1,dtset%npspinor)
239      dtset%npkpt    = max(1,dtset%npkpt)
240      dtset%npfft    = max(1,dtset%npfft)
241      dtset%npband   = max(1,dtset%npband)
242      dtset%bandpp   = max(1,dtset%bandpp)
243      return
244    end if
245  end if
246 
247  nprocmin=2
248  if (xmpi_paral==1.and.dtset%paral_kgb>=0) nprocmin=max(2,nproc-100)
249  if (max_ncpus > 0 .and. autoparal/=0) nprocmin = 1
250 
251 !Need the metric tensor
252  call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd)
253  call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
254 
255 !Determine some quantities related to plane waves
256 !  - Crude estimation of the number of PW
257 !  - Number of G vectors in each direction
258  mpw=0;ngmin=0;ngmax=0
259  if (optdriver==RUNL_GSTATE) then
260    ecut_eff = dtset%ecut*dtset%dilatmx**2
261    mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation
262    if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1
263    call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt)
264    write(message,'(a,i8)') ' getmpw sequential formula gave: ',mpw
265    call wrtout(std_out,message,'COLL')
266  end if
267 
268  write(message,'(2a,i0)')  ch10,&
269 & ' Computing all possible proc distrib for this input with nproc less than ',nproc
270  call wrtout(std_out,message,'COLL')
271 
272 !Parallelization over images
273  npi_min=1;npi_max=1;nimage_eff=1
274  if (optdriver==RUNL_GSTATE) then
275    nimage_eff=dtset%ndynimage
276    if (dtset%ntimimage<=1) nimage_eff=dtset%nimage
277    npi_min=max(1,dtset%npimage)
278    npi_max=min(nproc,nimage_eff)
279    if (tread(2)==1) npi_max=dtset%npimage
280  end if
281 
282 !Parallelization over k-points and spin components (GS)
283  npk_min=1;npk_max=1;nkpt_eff=0
284  if (optdriver==RUNL_GSTATE) then
285    nkpt_eff=dtset%nkpt*dtset%nsppol
286    npk_min=max(1,dtset%npkpt)
287    npk_max=min(nproc,nkpt_eff)
288    if (tread(4)==1) npk_max=dtset%npkpt
289  end if
290 
291 !Parallelization over perturbations, k-points and spin components (DFPT)
292  npp_min=1;npp_max=1;npert_eff=1
293  if (optdriver==RUNL_RESPFN) then
294    if (dtset%paral_rf==1) then
295      call get_npert_rbz(dtset,nband_rbz,nkpt_rbz,npert_eff)
296      do jj=1,npert_eff
297        ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj))
298        nkpt_eff=max(nkpt_eff,ii)
299      end do
300      npp_min=max(1,dtset%nppert)
301      npp_max=min(nproc,npert_eff)
302      if (tread(3)==1) then
303        npp_max=dtset%nppert
304        if (npp_max>npert_eff) then
305          npp_min=npert_eff;npp_max=npert_eff
306          message='nppert is bigger than npert; we set nppert=npert'
307          MSG_WARNING(message)
308        end if
309      end if
310      npk_min=1
311      npk_max=min(nproc,nkpt_eff)
312      ABI_DEALLOCATE(nkpt_rbz)
313      ABI_DEALLOCATE(nband_rbz)
314    else
315      nkpt_eff=nproc
316      npk_min=nproc-5
317      npk_max=nproc
318    end if
319  end if
320 
321 !Parallelization over spinorial components
322  nps_min=1;nps_max=1
323  if (optdriver==RUNL_GSTATE) then
324    nps_min=max(1,dtset%npspinor)
325    nps_max=min(nproc,dtset%nspinor)
326    if (tread(5)==1) nps_max=dtset%npspinor
327  end if
328 
329 !KGB Parallelization
330 
331 !>> FFT level
332  npf_min=1;npf_max=1
333  npb_min=1;npb_max=1
334  bpp_min=1;bpp_max=1
335  n2=0;n3=0
336  if (dtset%optdriver==RUNL_GSTATE) then
337    npf_min=max(1,dtset%npfft)
338    npf_min=min(npf_min,ngmin(2))
339    npf_max=min(nproc,NPFMAX)
340    if (tread(6)==1) then
341      npf_max=dtset%npfft
342      if (npf_max>ngmin(2)) then
343        write(message,'(3a)') &
344 &       "Value of npfft given in input file is too high for the FFT grid!",ch10,&
345 &       "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)."
346        MSG_ERROR(message)
347      end if
348    end if
349    npf_max=min(npf_max,ngmin(2))
350    if (dtset%use_gpu_cuda==1) then
351      npf_min=1;npf_max=1
352    end if
353 
354 !  Number of FFT procs has to be a multiple of FFT grid sizes
355 !  In case of a restart from a density file, it has to be
356 !  compatible with the FFT grid used for the density
357    n2=dtset%ngfft(2) ; n3=dtset%ngfft(3)
358    if (n2==0.and.n3==0.and.(dtset%getden/=0.or.dtset%irdden/=0.or.dtset%iscf<0)) then
359      dtset_found=.false.;file_found=.false.
360      !1-Try to find ngfft from previous dataset
361      if (dtset%getden/=0) then
362        do ii=1,ndtset_alloc
363          jj=dtset%getden;if (jj<0) jj=dtset%jdtset+jj
364          if (dtsets(ii)%jdtset==jj) then
365            dtset_found=.true.
366 !          n2=dtsets(ii)%nfftdg;n3=0
367            n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3)
368          end if
369        end do
370      end if
371      !2-If not found, try to extract ngfft from density file
372      if (.not.dtset_found) then
373        !Retrieve file name
374        suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN'
375        ABI_ALLOCATE(jdtset_,(0:ndtset_alloc))
376        jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset
377        call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii)
378        ABI_DEALLOCATE(jdtset_)
379        !Retrieve ngfft from file header
380        idum3=0
381        if (mpi_enreg%me==0) then
382          inquire(file=trim(filden),exist=file_found)
383          if (file_found) then
384            call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self)
385            idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1
386            call hdr_free(hdr0)
387            MSG_WARNING("Cannot find filden"//filden)
388          end if
389        end if
390        call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii)
391        n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0)
392      end if
393    end if
394 
395 !  >> Band level
396    npb_min=max(1,dtset%npband)
397    npb_max=min(nproc,mband)
398    if (tread(7)==1) npb_max=dtset%npband
399 
400 !  >> banddp level
401    bpp_min=max(1,dtset%bandpp)
402    bpp_max=max(4,nint(mband/10.)) ! reasonnable bandpp max
403    if (tread(8)==1) bpp_max=dtset%bandpp
404  end if
405 
406 !Disable KGB parallelisation in some cases:
407 !  - no GS
408 !  - paral_kgb=0 present in input file
409 !  - nstep=0
410 !  - Self-consistent DMFT
411 !  - Hartree-Fock or hybrid calculation (for now on)
412  if ( (optdriver/=RUNL_GSTATE) .or. (dtset%paral_kgb==0.and.tread(1)==1) .or. &
413 & (dtset%nstep==0).or. (dtset%usedmft==1.and.dtset%nstep>1) .or. &
414 & (dtset%usefock==1) ) then
415    nps_min=1; nps_max=1
416    npf_min=1; npf_max=1
417    npb_min=1; npb_max=1
418    bpp_min=1; bpp_max=1
419  end if
420 
421 !Print title
422  if (iam_master) then
423    if (optdriver==RUNL_GSTATE) then
424      write(message, '(8(a12,a1),a,8(i4,a4,i4,a1))' )  &
425      'npimage','|','npkpt','|','npspinor','|','npfft','|','npband','|',' bandpp ' ,'|','nproc','|','weight','|', ch10, &
426      npi_min,' -> ',npi_max,'|',npk_min,' -> ',npk_max,'|',nps_min,' -> ',nps_max,'|', &
427      npf_min,' -> ',npf_max,'|',npb_min,' -> ',npb_max,'|',bpp_min,' -> ',bpp_max,'|', &
428      nprocmin,' -> ',nproc,'|', 1 ,' -> ',nproc,'|'
429    end if
430    if (optdriver==RUNL_RESPFN) then
431      write(message, '(4(a12,a1),a,4(i4,a4,i4,a1))' )  &
432      'nppert','|','npkpt','|','nproc','|','weight','|', ch10, &
433      npp_min,' -> ',npp_max,'|',      npk_min,' -> ',npk_max,'|', &
434      nprocmin,' -> ',nproc,'|', 1 ,' -> ',nproc,'|'
435    end if
436    call wrtout(std_out,message,'COLL')
437    if(max_ncpus>0) then
438      call wrtout(ab_out,message,'COLL')
439    end if
440  end if
441 
442 !Allocate lists
443  ABI_ALLOCATE(my_distp,(10,MAXCOUNT))
444  ABI_ALLOCATE(weight,(MAXCOUNT))
445  my_distp(1:7,:)=0;weight(:)=zero
446  my_distp(8,:)=dtset%use_slk
447  my_distp(9,:)=dtset%np_slk
448  my_distp(10,:)=dtset%gpu_linalg_limit
449  icount=0;imin=1
450 
451  npc_min=1;npc_max=1;ncell_eff=1
452  if (optdriver==RUNL_GSTATE) then
453    ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max
454  end if
455  if (optdriver==RUNL_RESPFN) then
456    ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max
457  end if
458 
459 !Loop over all possibilities
460 !Computation of weight~"estimated acceleration"
461  if (new_version) then
462 
463 !  ======= NEW VERSION ========
464    do npc=npc_min,npc_max
465      acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc)
466 
467      do npk=npk_min,npk_max
468 !      -> for DFPT runs, impose that nsppol divide npk
469        if (optdriver==RUNL_RESPFN.and.modulo(npk,dtset%nsppol)>0.and.npk>1) cycle
470        acc_k=one;if (npk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,npk)
471 
472        do nps=nps_min,nps_max
473          acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps)
474 
475          do npf=npf_min,npf_max
476 !          -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok)
477            if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle
478 !          -> npf should be only divisible by 2, 3 or 5
479            ii=npf
480            do while (modulo(ii,2)==0)
481              ii=ii/2
482            end do
483            do while (modulo(ii,3)==0)
484              ii=ii/3
485            end do
486            do while (modulo(ii,5)==0)
487              ii=ii/5
488            end do
489            if(ii/=1) cycle
490 
491            do npb=npb_min,npb_max
492              nproc1=npc*npk*nps*npf*npb
493              if (nproc1<nprocmin)     cycle
494              if (nproc1>nproc)        cycle
495              if (modulo(mband,npb)>0) cycle
496 
497 !            Base speedup
498              acc_kgb_0=one;if (npb*npf>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf))
499 
500              if (npb*npf>4) then
501 !              Promote npb=npf
502                acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf))
503 !              Promote npf<=20
504                if (npf>20)then
505                  acc_kgb_0=acc_kgb_0* &
506 &                 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) &
507 &                 /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2
508                end if
509              end if
510 
511              first_bpp=.true.
512              do bpp=bpp_min,bpp_max
513                if (modulo(mband/npb,bpp)>0) cycle
514                if ((bpp>1).and.(modulo(bpp,2)>0)) cycle
515                if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle
516                if (npb*npf<=4.and.(.not.first_bpp)) cycle
517                first_bpp=.false.
518 
519                acc_kgb=acc_kgb_0
520 !              Promote bpp*npb>mband/3
521                if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband))
522 
523 !              Resulting speedup
524 !              weight0=acc_c*acc_k*acc_s*acc_kgb
525                weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+npk+nps+(npf*npb))
526 
527 !              Store data
528                icount=icount+1
529                if (icount<=MAXCOUNT) then
530                  my_distp(1:7,icount)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
531                  weight(icount)=weight0
532                  if (weight0<weight(imin)) imin=icount
533                else
534                  if (weight0>weight(imin)) then
535                    my_distp(1:7,imin)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
536                    weight(imin)=weight0
537                    idum=minloc(weight);imin=idum(1)
538                  end if
539                end if
540 
541              end do ! bpp
542            end do ! npb
543          end do ! npf
544        end do ! nps
545      end do ! npk
546    end do ! npc
547  else
548 
549 !  ======= OLD VERSION ========
550    do npc=npc_min,npc_max
551      acc_c=one;if (npc>1) acc_c = 0.99_dp*ncell_eff/((ncell_eff+npc-1)/npc)
552 
553      do npk=npk_min,npk_max
554        acc_k=one;if (npk>1) acc_k = 0.96_dp*nkpt_eff/((nkpt_eff+npk-1)/npk)
555 
556        do nps=nps_min,nps_max
557          acc_s=one;if (nps>1) acc_s = 0.85_dp*dtset%nspinor/ ((dtset%nspinor+nps-1)/nps)
558 
559          do npf=npf_min,npf_max
560 !          -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok)
561            if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle
562 !          -> npf should be only divisible by 2, 3, 5, 7 or 11
563            npb=npf ! Note that here, npb is used as a temp var
564            do while (modulo(npb,2)==0)
565              npb=npb/2
566            end do
567            do while (modulo(npb,3)==0)
568              npb=npb/3
569            end do
570            do while (modulo(npb,5)==0)
571              npb=npb/5
572            end do
573            do while (modulo(npb,7)==0)
574              npb=npb/7
575            end do
576            do while (modulo(npb,11)==0)
577              npb=npb/11
578            end do
579            if(npb/=1) cycle
580 
581            do npb=npb_min,npb_max
582              nproc1=npc*npk*nps*npf*npb
583              if (nproc1<nprocmin) cycle
584              if (nproc1>nproc) cycle
585              if(modulo(mband,npb)>0) cycle
586 
587              do bpp=bpp_max,bpp_min,-1
588                if(modulo(mband/npb,bpp)>0) cycle
589                if((bpp>1).and.(modulo(bpp,2)>0)) cycle
590                if (1.*npb*bpp >max(1.,mband/3.)) cycle
591 
592                acc_kgb=one
593                if (npb*npf>4) then
594                  acc_kgb=min((one*npf)/(one*npb),(one*npb)/(one*npf))  * &
595                  (mpw/(mpw/(npb*npf)))*(one-(three*bpp*npb)/mband)
596                else if (npb*npf >1) then
597                  acc_kgb=(mpw*mband/(mband*mpw/(npb*npf)))*0.7_dp
598                end if
599 
600 !              Weight average for efficiency and estimated acceleration
601                weight0=(acc_c+acc_k+acc_s+acc_kgb)/(npc+npk+nps+(npf*npb))
602                weight0=weight0*nproc1
603 
604 !              Store data
605                icount=icount+1
606                if (icount<=MAXCOUNT) then
607                  my_distp(1:7,icount)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
608                  weight(icount)=weight0
609                  if (weight0<weight(imin)) imin=icount
610                else
611                  if (weight0>weight(imin)) then
612                    my_distp(1:7,imin)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
613                    weight(imin)=weight0
614                    idum=minloc(weight);imin=idum(1)
615                  end if
616                end if
617 
618              end do ! bpp
619            end do ! npb
620          end do ! npf
621        end do ! nps
622      end do ! npk
623    end do ! npc
624 
625 !  New or old version
626  end if
627 
628  mcount_eff=icount
629  mcount=min(mcount_eff,MAXCOUNT)
630 
631  if (mcount==0) then
632    write(message,'(a,i0,2a,i0,a)')  &
633    'Your input dataset does not let Abinit find an appropriate process distribution with nproc=',nproc,ch10, &
634    'Try to comment all the np* vars and set paral_kgb=',-nproc,' to have advices on process distribution.'
635    MSG_WARNING(message)
636 !  Override here the 0 default value changed in indefo1
637    dtset%npimage  = max(1,dtset%npimage)
638    dtset%nppert   = max(1,dtset%nppert)
639    dtset%npkpt    = max(1,dtset%npkpt)
640    dtset%npspinor = max(1,dtset%npspinor)
641    dtset%npfft    = max(1,dtset%npfft)
642    dtset%npband   = max(1,dtset%npband)
643    dtset%bandpp   = max(1,dtset%bandpp)
644    ABI_DEALLOCATE(my_distp)
645    ABI_DEALLOCATE(weight)
646    return
647  end if
648 
649 !* HF or hybrid calculation: no use of the fonction "autoparal"
650  if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then
651    write(message,'(a,i5,2a,i6,a)')  &
652    'Hartree-Fock or hybrid calculation : Your input dataset does not let Abinit find an appropriate process distribution.'
653    MSG_WARNING(message)
654 !  Override here the 0 default value changed in indefo1
655    dtset%npimage  = max(1,dtset%npimage)
656    dtset%npkpt    = max(1,dtset%npkpt)
657    dtset%npspinor = max(1,dtset%npspinor)
658    dtset%npfft    = max(1,dtset%npfft)
659    dtset%npband   = max(1,dtset%npband)
660    dtset%bandpp   = max(1,dtset%bandpp)
661    ABI_DEALLOCATE(my_distp)
662    ABI_DEALLOCATE(weight)
663    return
664  end if
665 
666 !Sort data by increasing weight
667  ABI_ALLOCATE(isort,(mcount))
668  isort=(/(ii,ii=1,mcount)/)
669  call sort_dp(mcount,weight,isort,tol6)
670 
671  ncount=mcount;if (dtset%paral_kgb>=0) ncount=min(mcount,5)
672  if (iam_master) then
673    do jj=mcount,mcount-ncount+1,-1
674      ii=isort(jj)
675      if (optdriver==RUNL_GSTATE) then
676        write(message, '(7(i12,a1),f11.2,a2)') &
677 &       my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|', &
678 &       my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),' |'
679      end if
680      if (optdriver==RUNL_RESPFN) then
681        write(message, '(3(i12,a1),f11.2,a2)') &
682 &       my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(7,ii),'|',weight(jj),' |'
683      end if
684      call wrtout(std_out,message,'COLL')
685      if(max_ncpus>0) then
686        call wrtout(ab_out,message,'COLL')
687      end if
688    end do
689  end if
690 
691  if (max_ncpus>0.and.(mcount_eff>MAXCOUNT)) then
692    write(message,'(a,i0,a,i0,a)') &
693 &   ' Received max_ncpus ',max_ncpus,' possible choices for nproc; only the first ',MAXCOUNT,' ones are printed...'
694    call wrtout(ab_out,message,'COLL')
695    call wrtout(std_out,message,'COLL')
696  end if
697 
698  !if (iam_master .and. dtset%paral_kgb<0) then
699  if (iam_master .and. max_ncpus>0) then
700    write(ount,'(2a)')ch10,"--- !Autoparal"
701 
702    if (optdriver==RUNL_GSTATE) then
703      write(ount,"(a)")"#Autoparal section for GS calculations with paral_kgb"
704    else if (optdriver==RUNL_RESPFN) then
705      write(ount,"(a)")'#Autoparal section for DFPT calculations'
706    else
707      MSG_ERROR("Unsupported optdriver")
708    end if
709 
710    write(ount,"(a)")   "info:"
711    write(ount,"(a,i0)")"    autoparal: ",autoparal
712    write(ount,"(a,i0)")"    paral_kgb: ",dtset%paral_kgb
713    write(ount,"(a,i0)")"    max_ncpus: ",max_ncpus
714    write(ount,"(a,i0)")"    nspinor: ",dtset%nspinor
715    write(ount,"(a,i0)")"    nsppol: ",dtset%nsppol
716    write(ount,"(a,i0)")"    nkpt: ",dtset%nkpt
717    write(ount,"(a,i0)")"    mband: ",mband
718 
719    ! List of configurations.
720    write(ount,"(a)")"configurations:"
721 
722    if (optdriver==RUNL_GSTATE) then
723 
724      do jj=mcount,mcount-ncount+1,-1
725        ii=isort(jj)
726        tot_ncpus = my_distp(7,ii)
727        eff = weight(jj) / tot_ncpus
728 
729        write(ount,"(a,i0)")"    - tot_ncpus: ",tot_ncpus
730        write(ount,"(a,i0)")"      mpi_ncpus: ",tot_ncpus
731        !write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus !OMP not supported  (yet)
732        write(ount,"(a,f12.9)")"      efficiency: ",eff
733        !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
734 
735        ! list of variables to use.
736        !'npimage','|','npkpt','|','npspinor','|','npfft','|','npband','|',' bandpp ' ,'|','nproc','|','weight','|'
737        write(ount,"(a)"   )"      vars: {"
738        write(ount,"(a,i0,a)")"            npimage: ",my_distp(1,ii),","
739        write(ount,"(a,i0,a)")"            npkpt: ", my_distp(2,ii),","
740        write(ount,"(a,i0,a)")"            npspinor: ",my_distp(3,ii),","
741        write(ount,"(a,i0,a)")"            npfft: ", my_distp(4,ii),","
742        write(ount,"(a,i0,a)")"            npband: ",my_distp(5,ii),","
743        write(ount,"(a,i0,a)")"            bandpp: ",my_distp(6,ii),","
744        write(ount,"(a)")   "            }"
745      end do
746 
747    else if (optdriver==RUNL_RESPFN) then
748 
749      do jj=mcount,mcount-ncount+1,-1
750        ii=isort(jj)
751        tot_ncpus = my_distp(7,ii)
752        eff = weight(jj) / tot_ncpus
753 
754        write(ount,"(a,i0)")"    - tot_ncpus: ",tot_ncpus
755        write(ount,"(a,i0)")"      mpi_ncpus: ",tot_ncpus
756        !write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus !OMP not supported  (yet)
757        write(ount,"(a,f12.9)")"      efficiency: ",eff
758        !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
759        ! list of variables to use.
760        !'nppert','|','npkpt','|','nproc','|','weight','|',
761        write(ount,"(a)"   )"      vars: {"
762        write(ount,"(a,i0,a)")"             nppert: ", my_distp(1,ii),","
763        write(ount,"(a,i0,a)")"             npkpt: ", my_distp(2,ii),","
764        write(ount,"(a)")   "            }"
765      end do
766 
767    end if
768    write(ount,'(a)')"..."
769    iexit = iexit + 1
770  end if
771 
772  icount=isort(mcount)
773 
774 !Refinement of the process distribution by mean of a LinAlg routines benchmarking
775  if (optdriver==RUNL_GSTATE.and.autoparal/=1) then
776    if (autoparal/=3) then
777      if (autoparal==2) then
778        write(message,'(5a,9(a10,a1))') ch10, &
779 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
780 &       ' Weights below are corrected according:',ch10,&
781 &       'npimage','|','npkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
782 &       'nproc'  ,'|','weight','|','new weight','|'
783      else
784        write(message,'(5a,11(a10,a1))') ch10, &
785 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
786 &       ' Weights below are corrected according:',ch10,&
787 &       'npimage','|','npkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
788 &       'nproc'  ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|'
789      end if
790      call wrtout(std_out,message,'COLL')
791      if (max_ncpus > 0) then
792        call wrtout(ab_out,message,'COLL')
793      end if
794    end if
795    acc_k=zero
796    ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1
797    do jj=mcount,mcount-ncount+1,-1
798      ii=isort(jj)
799      npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii)
800      if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then
801        use_linalg_gpu=dtset%use_gpu_cuda
802        call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu)
803        if (autoparal/=2) then
804          my_distp(9,ii)=np_slk
805          if (np_slk>0) my_distp(8,ii)=1
806 !        * gpu_linalg_limit:
807 !        No use of GPU: huge value ~2  *vectsize*blocksize**2 tested
808 !        Use of GPU:    tiny value ~0.5*vectsize*blocksize**2 tested
809          my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf
810          if (use_linalg_gpu==1) my_distp(10,ii)=my_distp(10,ii)/4
811        end if
812        if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed
813 !      * Weight (corrected by 10% of the computed ratio)
814        weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb)
815        if (autoparal==2) then
816          write(message, '(7(i10,a1),f9.2,a2,f9.5,a2)') &
817 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
818 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |'
819        else if (autoparal==3) then
820          write(message,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, &
821 &         ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),&
822 &         ' and use_linalg_gpu=',use_linalg_gpu
823        else
824          write(message, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') &
825 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
826 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',&
827 &         my_distp(9,ii),'|',use_linalg_gpu,'|'
828        end if
829        call wrtout(std_out,message,'COLL')
830        if (max_ncpus>0) then
831          call wrtout(ab_out,message,'COLL')
832        end if
833 !      We store the best value in weight(mcount) and keep icount
834        if (weight0 > weight(mcount)) then
835          icount=ii;weight(mcount)=weight0
836        end if
837      end if
838    end do
839  end if
840 
841 !Final advice in case max_ncpus > 0
842  if (max_ncpus>0) then
843    write(message,'(6a)') ch10,&
844 &   ' Launch a parallel version of ABINIT with a number of processors among the above list,',ch10,&
845 &   ' and the associated input variables npkpt, npband, npfft and bandpp. ',ch10,&
846 &   ' The optimal weight is close to nproc and the higher should be better.'
847    call wrtout(std_out,message,'COLL')
848    call wrtout(ab_out,message,'COLL')
849    iexit=iexit+1
850    GOTO 100
851  end if
852 
853 !Store new process distribution
854  if (dtset%paral_kgb>=0) then
855    nproc1=my_distp(7,icount)
856 !  Work load distribution
857    if (optdriver==RUNL_GSTATE) then
858      dtset%npimage= my_distp(1,icount)
859      dtset%nppert = 1
860      dtset%npkpt  = my_distp(2,icount)
861    end if
862    if (optdriver==RUNL_RESPFN) then
863      dtset%npimage= 1
864      dtset%nppert = my_distp(1,icount)
865      dtset%npkpt  = 1
866    end if
867    dtset%npspinor = my_distp(3,icount)
868    dtset%npfft    = my_distp(4,icount)
869    dtset%npband   = my_distp(5,icount)
870    dtset%bandpp   = my_distp(6,icount)
871 !  The following lines are mandatory : the DFT+DMFT must use ALL the
872 !  available procs specified by the user. So nproc1=nproc.
873 !  Works only if paral_kgb is not activated.
874    if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE.and.dtset%paral_kgb==0) then
875      dtset%npspinor = 1
876      dtset%npfft    = 1
877      dtset%npband   = 1
878      dtset%bandpp   = 1
879      dtset%npimage  = 1
880      nproc1         = nproc
881    end if
882    if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1
883 !  LinAlg parameters: we change values only if they are not present in input file
884    if (dtset%paral_kgb==1) then
885      if (tread(9)==0) dtset%use_slk=my_distp(8,icount)
886      if (tread(10)==0) dtset%np_slk=my_distp(9,icount)
887      if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount)
888    end if
889 !  New definition of "world" MPI communicator
890    if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then
891      nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90
892      nproc1=min(nproc1,nproc)
893      nproc1=(nproc1/dtset%nppert)*dtset%nppert
894    end if
895    call initmpi_world(mpi_enreg,nproc1)
896 !   call initmpi_world(mpi_enreg,nproc1)
897  end if
898 
899  100 continue
900 
901  ABI_DEALLOCATE(isort)
902  ABI_DEALLOCATE(my_distp)
903  ABI_DEALLOCATE(weight)
904 
905  DBG_EXIT("COLL")
906 
907  contains
908 
909    function speedup_fdp(nn,mm)
910    !Expected linear speedup for a nn-sized problem and mm processes
911 
912 !This section has been created automatically by the script Abilint (TD).
913 !Do not modify the following lines by hand.
914 #undef ABI_FUNC
915 #define ABI_FUNC 'speedup_fdp'
916 !End of the abilint section
917 
918    real(dp) :: speedup_fdp
919    integer,intent(in) :: nn,mm
920    speedup_fdp=(one*nn)/(one*((nn/mm)+merge(0,1,mod(nn,mm)==0)))
921  end function speedup_fdp
922 
923 end subroutine finddistrproc