ABINIT/compute_kgb_indicator [ Functions ]


NAME

 compute_kgb_indicator

FUNCTION

 Only for "KGB" parallelism (LOBPCG algorithm for the ground state):
  Gives an indicator of performance for a given distribution of processors
  (npband, npfft and bandpp).
  Determines the best choice of parameters for the ScaLapack and/or Magma linear algebra routines.

INPUTS

  bandpp=internal LOBPCG optimization variable
  glb_comm=communicator for global MPI communications
  mband=maximum number of bands
  mpw=maximum number of plane waves
  npband=number of processors for the 'band' level
  npfft=number of processors for the 'fft' level
  uselinalggpu=indicates whether the GPU linear algebra is also tested

OUTPUT

  acc_kgb = indicator of performance
  npslk = number of processes to be used in the ScaLapack communicators

SIDE EFFECTS

 This routine can be used to find an indicator in order to refine the automatic process distribution.
   This indicator is returned in acc_kgb.
 This routine can be used to find the optimal value of the np_slk parameter (ScaLapack)
   and whether or not Magma should be used for linear algebra in lobpcgwf.
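
 A minimal usage sketch, modelled on the call issued from finddistrproc (see its SOURCE
 below); the surrounding variables (bandpp, npband, npfft, mband, mpw, dtset%use_gpu_cuda)
 are assumed to be set up by the caller as in that routine:

   use_linalg_gpu=dtset%use_gpu_cuda
   call compute_kgb_indicator(acc_kgb,bandpp,xmpi_world,mband,mpw,npband,npfft,np_slk,use_linalg_gpu)
   ! On return: acc_kgb is the timing-based indicator (min_ortho+four*min_eigen, so lower is faster),
   ! np_slk is the recommended ScaLapack communicator size, and use_linalg_gpu tells whether
   ! the GPU (Magma) linear algebra turned out to be the fastest option tested.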

PARENTS

      finddistrproc

CHILDREN

      abi_linalg_finalize,abi_linalg_init,abi_xhegv,abi_xorthonormalize
      wrtout,xmpi_bcast,xmpi_comm_free

SOURCE

1891 subroutine compute_kgb_indicator(acc_kgb,bandpp,glb_comm,mband,mpw,npband,npfft,npslk,uselinalggpu)
1892 
1893  use m_abi_linalg
1894 
1895 !This section has been created automatically by the script Abilint (TD).
1896 !Do not modify the following lines by hand.
1897 #undef ABI_FUNC
1898 #define ABI_FUNC 'compute_kgb_indicator'
1899 !End of the abilint section
1900 
1901  implicit none
1902 
1903 !Arguments ------------------------------------
1904 !scalars
1905  integer,intent(in) :: bandpp,glb_comm,mband,mpw,npband,npfft
1906  integer,intent(inout) :: npslk,uselinalggpu
1907  real(dp),intent(inout) :: acc_kgb
1908 
1909 !Local variables-------------------------------
1910 !scalars
1911  integer,parameter :: max_number_of_npslk=10,max_number_of_iter=10
1912  integer :: blocksize,bigorder,ierr,ii,islk,islk1,iter,jj,keep_gpu
1913  integer :: kgb_comm,my_rank,np_slk,np_slk_max,np_slk_best,nranks
1914  integer :: use_lapack_gpu,use_slk,vectsize
1915  real(dp) :: min_eigen,min_ortho,time_xeigen,time_xortho
1916  character(len=500) :: message
1917 !arrays
1918  integer,allocatable :: ranks(:),val_npslk(:)
1919  real(dp),allocatable :: eigen(:),grama(:,:),gramb(:,:)
1920  complex(dpc),allocatable :: blockvectorbx(:,:),blockvectorx(:,:),sqgram(:,:)
1921 
1922 !******************************************************************
1923 
1924  DBG_ENTER("COLL")
1925 
1926 #ifdef DEBUG_MODE
1927  write(message,'(a,3i3)') 'compute_kgb_indicator : (bpp,npb,npf) = ', bandpp, npband, npfft
1928  call wrtout(std_out,message,'PERS')
1929 #endif
1930 
1931 !Create local communicator for test
1932  if (xmpi_paral==1) then
1933    nranks=npfft*npband
1934    ABI_ALLOCATE(ranks,(nranks))
1935    ranks=(/((my_rank-1),my_rank=1,nranks)/)
1936    kgb_comm=xmpi_subcomm(glb_comm,nranks,ranks,my_rank_in_group=my_rank)
1937    ABI_DEALLOCATE(ranks)
1938  else
1939    kgb_comm=xmpi_comm_self
1940    my_rank=0
1941  end if
1942 
1943 !Only for process in the new subgroup
1944  if (my_rank/=xmpi_undefined) then
1945 
1946 !  We enforce vectsize >=blocksize  (This is not true in lobpcg but
1947 !  these are rare cases and this simplify the matrix constructions below...)
1948    blocksize=npband*bandpp
1949    vectsize=max(1+mpw/(npband*npfft),blocksize)
1950    bigorder=3*blocksize
1951 
1952    ABI_ALLOCATE(blockvectorx,(vectsize,blocksize))
1953    ABI_ALLOCATE(blockvectorbx,(vectsize,blocksize))
1954    ABI_ALLOCATE(sqgram,(blocksize,blocksize))
1955    ABI_ALLOCATE(grama,(2*bigorder,bigorder))
1956    ABI_ALLOCATE(gramb,(2*bigorder,bigorder))
1957    ABI_ALLOCATE(eigen,(bigorder))
1958    ABI_ALLOCATE(val_npslk,(max_number_of_npslk)) ! not too much values tested
1959 
1960    min_eigen=greatest_real
1961    min_ortho=greatest_real
1962    np_slk_best=-1 ; np_slk_max=0
1963 #ifdef HAVE_LINALG_SCALAPACK
1964    np_slk_max=min(max_number_of_npslk,npband*npfft)
1965 #endif
1966 
1967 !  Preselect a range of available np_slk values
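!  (the first candidate, np_slk=0, corresponds to plain LAPACK without ScaLapack;
!   then 1; each further candidate is at least twice the previous one, rounded up
!   to the next divisor of npband*npfft, and the list stops once a candidate would
!   exceed npband*npfft or mband)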
1968    val_npslk(1:)=0 ; val_npslk(2)=1
1969    do islk=3,np_slk_max
1970      np_slk=val_npslk(islk-1)*2
1971      do while ((modulo(npband*npfft,np_slk)>0).and.(np_slk<(npband*npfft)))
1972        np_slk=np_slk+1
1973      end do
1974      if(np_slk>(npband*npfft).or.np_slk>mband) exit
1975      val_npslk(islk)=np_slk
1976    end do
1977    np_slk_max=islk-1
1978 
1979 !  Loop over np_slk values
1980    islk1=1
1981 #ifdef HAVE_LINALG_MAGMA
1982    islk1=1-uselinalggpu
1983 #endif
1984    do islk=islk1,np_slk_max
1985 
1986      time_xortho=zero ; time_xeigen=zero
1987 
1988      use_slk=0
1989      if (islk==0) then
1990 !      This is the test for the GPU
1991        use_lapack_gpu=1 ; np_slk=0
1992      else
1993        use_lapack_gpu=0 ; np_slk=val_npslk(islk)
1994        if (np_slk>0) use_slk=1
1995      end if
1996 
1997 !    Initialize linalg parameters for this np_slk value
1998 !    For the first np_slk value, everything is initialized
1999 !    For the following np_slk values, only Scalapack parameters are updated
2000      call abi_linalg_init(kgb_comm,np_slk,bigorder,my_rank,only_scalapack=(islk>islk1))
2001 
2002 !    We could do mband/blocksize iter as in lobpcg but it's too long
2003      do iter=1,max_number_of_iter
2004 
2005 !      Build matrixes
2006        do ii=1,vectsize
2007          do jj=1,blocksize
2008            if (ii>jj) then
2009              blockvectorx(ii,jj) =czero
2010              blockvectorbx(ii,jj)=czero
2011            else
2012              blockvectorx(ii,jj) =cone
2013              blockvectorbx(ii,jj)=cone
2014            end if
2015          end do
2016        end do
2017        grama=zero;gramb=zero
2018        do jj=1,bigorder
2019          do ii=jj,bigorder
2020            if (ii==jj) then
2021              grama(2*ii-1,jj)=one
2022              gramb(2*ii-1,jj)=one
2023            else
2024              grama(2*ii-1:2*ii,jj)=one
2025              grama(2*jj-1,ii)= one
2026              grama(2*jj  ,ii)=-one
2027            end if
2028          end do
2029        end do
2030 
2031 !      Call to abi_xorthonormalize
2032        time_xortho=time_xortho-abi_wtime()
2033        call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,kgb_comm,sqgram,vectsize)
2034        time_xortho = time_xortho + abi_wtime()
2035 
2036 !      Call to abi_xhegv
2037        time_xeigen=time_xeigen-abi_wtime()
2038        call abi_xhegv(1,'v','u',bigorder,grama,gramb,eigen,&
2039 &       x_cplx=2,use_slk=use_slk,use_gpu=use_lapack_gpu)
2040        time_xeigen=time_xeigen+abi_wtime()
2041 
2042      end do ! iter
2043 
2044 !    Finalize linalg parameters for this np_slk value
2045 !    For the last np_slk value, everything is finalized
2046 !    For the previous np_slk values, only Scalapack parameters are updated
2047      call abi_linalg_finalize(only_scalapack=(islk<np_slk_max))
2048 
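!    Scale the accumulated timings by mband/blocksize, i.e. by the number of blocks
!    a full LOBPCG step would have to process (cf. the comment above the "iter" loop)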
2049      time_xortho= time_xortho*mband/blocksize
2050      time_xeigen= time_xeigen*mband/blocksize
2051      if (time_xortho<min_ortho) min_ortho=time_xortho
2052      if (time_xeigen<min_eigen) then
2053        min_eigen=time_xeigen
2054        np_slk_best=np_slk
2055        keep_gpu=use_lapack_gpu
2056      end if
2057 
2058    end do ! np_slk
2059 
2060 #ifdef DEBUG_MODE
2061    write(message,'(2(a,es15.3),a,i3)') ' In the best case, xortho took ',min_ortho,&
2062 &   ' and xeigen took ',min_eigen,' for np_slk=',np_slk_best
2063    call wrtout(std_out,message,'PERS')
2064 #endif
2065 
2066 !  Final values to be sent to others process
2067    acc_kgb=min_ortho+four*min_eigen
2068    npslk=max(np_slk_best,1)
2069    uselinalggpu=keep_gpu
2070 
2071    ABI_DEALLOCATE(blockvectorx)
2072    ABI_DEALLOCATE(blockvectorbx)
2073    ABI_DEALLOCATE(sqgram)
2074    ABI_DEALLOCATE(grama)
2075    ABI_DEALLOCATE(gramb)
2076    ABI_DEALLOCATE(eigen)
2077    ABI_DEALLOCATE(val_npslk)
2078 
2079  end if ! my_rank in group
2080 
2081 !Free local MPI communicator
2082  call xmpi_comm_free(kgb_comm)
2083 
2084 !Broadcast of results to be sure every process has them
2085  call xmpi_bcast(acc_kgb,0,glb_comm,ierr)
2086  call xmpi_bcast(npslk,0,glb_comm,ierr)
2087  call xmpi_bcast(uselinalggpu,0,glb_comm,ierr)
2088 
2089 #ifndef DEBUG_MODE
2090  ABI_UNUSED(message)
2091 #endif
2092 
2093  DBG_EXIT("COLL")
2094 
2095 end subroutine compute_kgb_indicator

ABINIT/finddistrproc [ Functions ]


NAME

 finddistrproc

FUNCTION

   Given a total number of processors, find a suitable distribution
   that fills all the different levels of parallelization
   (npimage, nppert, npkpt, npspinor, npband, npfft, bandpp).
   Also determine the parameters of the parallel linear algebra routines
   (use_slk, np_slk, gpu_linalg_limit).
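
   The candidate distributions are scored with a simple heuristic. A standalone sketch of
   that scoring (not ABINIT code) is given below; the 0.99/0.96/0.85/0.70 prefactors and
   the weight formula are those of the "NEW VERSION" loop in the SOURCE, while the problem
   sizes (nkpt_eff=8, mpw=20000, ...) are arbitrary example values:

     program score_distrib_sketch
       implicit none
       integer,parameter :: dp=kind(1.0d0)
       ! Arbitrary example problem sizes (assumptions, not taken from a real run)
       integer,parameter :: nimage_eff=1,nkpt_eff=8,nspinor=1,mpw=20000
       integer :: npimage,npkpt,npspinor,npband,npfft,nproc
       real(dp) :: acc_c,acc_k,acc_s,acc_kgb,weight
       npimage=1 ; npkpt=4 ; npspinor=1 ; npband=4 ; npfft=2
       nproc=npimage*npkpt*npspinor*npband*npfft
       ! Partial "accelerations", one per parallelization level
       acc_c=1._dp   ; if (npimage>1)       acc_c=0.99_dp*speedup(nimage_eff,npimage)
       acc_k=1._dp   ; if (npkpt>1)         acc_k=0.96_dp*speedup(nkpt_eff,npkpt)
       acc_s=1._dp   ; if (npspinor>1)      acc_s=0.85_dp*speedup(nspinor,npspinor)
       acc_kgb=1._dp ; if (npband*npfft>1)  acc_kgb=0.70_dp*speedup(mpw,npband*npfft)
       ! Weighted average of the partial speedups, scaled by the total process count
       weight=nproc*(acc_c+acc_k+acc_s+acc_kgb)/(npimage+npkpt+npspinor+npband*npfft)
       write(*,'(a,i0,a,f8.2)') ' nproc=',nproc,'   estimated weight=',weight
     contains
       real(dp) function speedup(nn,mm) ! expected speedup of an nn-sized workload on mm processes
         integer,intent(in) :: nn,mm
         speedup=real(nn,dp)/real(nn/mm+merge(0,1,mod(nn,mm)==0),dp)
       end function speedup
     end program score_distrib_sketch

   In the actual routine, acc_kgb is further corrected for the npband/npfft balance and
   the bandpp value, and the MAXCOUNT best-scoring distributions are kept and printed.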

INPUTS

  dtsets(0:ndtset_alloc)=<type dataset_type> contains all input variables
   for all datasets; at this stage, only datasets with an index lower than
   idtset have already been initialized
  filnam(5)=character strings giving file names
  idtset=number of the current dataset
  mpi_enreg=information about MPI parallelization
  mband=maximum number of bands.
  ndtset_alloc=number of datasets, corrected for allocation of at least one data set
  tread(11)=flags indicating whether parallel input parameters were read from the input file
            tread(1)  : paral_kgb      tread(6) : npfft
            tread(2)  : npimage        tread(7) : npband
            tread(3)  : nppert         tread(8) : bandpp
            tread(4)  : npkpt          tread(9) : use_slk
            tread(5)  : nspinor        tread(10): np_slk
            tread(11) : gpu_linalg_limit

SIDE EFFECTS

  iexit= if incremented, an exit is required
  dtset%paral_kgb= flag for band-fft parallelism
  dtset%npimage  = number of processors for parallelisation over image
  dtset%nppert   = number of processors for parallelisation over perturbations
  dtset%npspinor = number of processors for parallelisation on spinorial components
  dtset%npkpt    = number of processors for parallelisation on k points
  dtset%npfft    = number of processors for parallelisation on fft grid
  dtset%npband   = number of processors for parallelisation on bands
  dtset%nphf     = number of processors for parallelisation on occupied states for fock exchange
  dtset%bandpp   = internal parameter for lobpcg parallelisation algorithm
  dtset%use_slk  = flag for ScaLapack use
  dtset%np_slk   = number of processors used in ScaLapack routines
  dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU

PARENTS

      mpi_setup

CHILDREN

      compute_kgb_indicator,get_npert_rbz,hdr_free,hdr_read_from_fname
      initmpi_world,kpgcount,metric,mkfilename,mkrdim,sort_dp,wrtout
      xmpi_bcast

SOURCE

1012  subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread)
1013 
1014 
1015 !This section has been created automatically by the script Abilint (TD).
1016 !Do not modify the following lines by hand.
1017 #undef ABI_FUNC
1018 #define ABI_FUNC 'finddistrproc'
1019 !End of the abilint section
1020 
1021  implicit none
1022 
1023 !Arguments ------------------------------------
1024 !scalars
1025  integer,intent(in) :: idtset,mband,ndtset_alloc
1026  integer,intent(inout) :: iexit
1027  type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc)
1028  type(MPI_type),intent(inout) :: mpi_enreg
1029 !arrays
1030  integer,intent(in) :: tread(11)
1031  character(len=fnlen),intent(in) :: filnam(5)
1032 
1033 !Local variables-------------------------------
1034 !scalars
1035 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20)
1036  integer,parameter :: NPFMAX=128
1037  integer,parameter :: MAXCOUNT=250,MAXBENCH=25,NPF_CUTOFF=20
1038  integer :: bpp,bpp_max,bpp_min,optdriver,autoparal
1039  integer :: npi_max,npi_min,npc,npc_max,npc_min
1040  integer :: npk,npk_max,npk_min,npp_max,npp_min
1041  integer :: nps,nps_max,nps_min,npf,npf_max,npf_min
1042  integer :: npb,npb_max,npb_min,max_ncpus,ount
1043  integer :: work_size,nks_per_proc,tot_ncpus
1044  integer :: icount,ii,imin,jj,mcount,mcount_eff,mpw
1045  integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff
1046  integer :: nproc,nproc1,nprocmin,np_slk,use_linalg_gpu,omp_ncpus
1047  logical,parameter :: new_version=.true.
1048  logical :: dtset_found,file_found,first_bpp,iam_master
1049  real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,ucvol,weight0
1050  real(dp):: eff
1051  character(len=9) :: suffix
1052  character(len=500) :: message
1053  character(len=fnlen) :: filden
1054  type(hdr_type) :: hdr0
1055 !arrays
1056  integer :: idum(1),idum3(3),ngmax(3),ngmin(3)
1057  integer,allocatable :: isort(:),jdtset_(:),my_distp(:,:)
1058  integer,pointer :: nkpt_rbz(:)
1059  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3)
1060  real(dp),allocatable :: weight(:)
1061  real(dp),pointer :: nband_rbz(:,:)
1062  type(dataset_type),pointer :: dtset
1063 !Cut-off function for npfft
1064 ! cutoff(nn)= &
1065 !&    0.2_dp+(one-0.2_dp)*(sin((pi*(nn-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) &
1066 !&                           /((pi*(nn-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2
1067 
1068 !******************************************************************
1069 
1070  DBG_ENTER("COLL")
1071 
1072 !Select current dataset
1073  dtset => dtsets(idtset)
1074 
1075 !Is automatic parallelization activated?
1076  autoparal = dtset%autoparal
1077  if (autoparal==0) return
1078 
1079 
1080  ! Handy local variables
1081  iam_master = (mpi_enreg%me==0)
1082  optdriver = dtset%optdriver
1083  max_ncpus = dtset%max_ncpus
1084 
1085  if (max_ncpus > 0 .and. autoparal/=0) then
1086    iexit = iexit + 1 ! will stop in the parent.
1087  end if
1088 
1089  ! Unit number used for outputting the autoparal sections
1090  ount = std_out
1091  ount = ab_out
1092 
1093  ! Small hack: set paral_kgb to -max_ncpus so that I don't have to change the previous implementation.
1094  !if (dtset%paral_kgb == 1 .and. max_ncpus > 0) then
1095  !  dtset%paral_kgb = -max_ncpus
1096  !end if
1097 
1098  if (optdriver==RUNL_GSTATE .and. dtset%paral_kgb==0 .and. &
1099 & max_ncpus>0 .and. autoparal/=0) then
1100    if (iam_master) then
1101      ! This corresponds to the simplest algorithm for GS (band-by-band CG)
1102      ! with distribution of k-points and spin.
1103      work_size = dtset%nkpt * dtset%nsppol
1104      write(ount,"(2a)")ch10,"--- !Autoparal"
1105      write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)"
1106      write(ount,"(a)")   "info:"
1107      write(ount,"(a,i0)")"    autoparal: ",autoparal
1108      write(ount,"(a,i0)")"    paral_kgb: ",dtset%paral_kgb
1109      write(ount,"(a,i0)")"    max_ncpus: ",max_ncpus
1110      write(ount,"(a,i0)")"    nspinor: ",dtset%nspinor
1111      write(ount,"(a,i0)")"    nsppol: ",dtset%nsppol
1112      write(ount,"(a,i0)")"    nkpt: ",dtset%nkpt
1113      write(ount,"(a,i0)")"    mband: ",mband
1114 
1115      ! List of configurations.
1116      ! Assuming an OpenMP implementation with perfect speedup!
1117      write(ount,"(a)")"configurations:"
1118 
1119      do ii=1,max_ncpus
1120        if (ii > work_size) cycle
1121        do omp_ncpus=1,xomp_get_max_threads()
1122          nks_per_proc = work_size / ii
1123          nks_per_proc = nks_per_proc + MOD(work_size, ii)
1124          eff = (one * work_size) / (ii * nks_per_proc)
1125 
1126          write(ount,"(a,i0)")"    - tot_ncpus: ",ii * omp_ncpus
1127          write(ount,"(a,i0)")"      mpi_ncpus: ",ii
1128          write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus
1129          write(ount,"(a,f12.9)")"      efficiency: ",eff
1130          !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
1131        end do
1132      end do
1133      write(ount,'(a)')"..."
1134    end if
1135    ! Return immediately, will stop in the parent.
1136    iexit = iexit + 1
1137    RETURN
1138  end if
1139 
1140 
1141  nproc=mpi_enreg%nproc
1142  !if (xmpi_paral==1.and.dtset%paral_kgb <0) nproc=-dtset%paral_kgb
1143  if (max_ncpus > 0) nproc = dtset%max_ncpus
1144  !if (xmpi_paral==1.and.dtset%paral_kgb <0) nproc=dtset%max_ncpus
1145  if (xmpi_paral==0.and.dtset%paral_kgb>=0) nproc=1
1146 
1147  if (dtset%paral_kgb>=0) then
1148    if (nproc==1) then
1149      if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0
1150      if (tread(2)==0.or.xmpi_paral==0) dtset%npimage  = 1
1151      if (tread(3)==0.or.xmpi_paral==0) dtset%nppert   = 1
1152      if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1
1153      if (tread(5)==0.or.xmpi_paral==0) dtset%npkpt    = 1
1154      if (tread(6)==0.or.xmpi_paral==0) dtset%npfft    = 1
1155      if (tread(7)==0.or.xmpi_paral==0) dtset%npband   = 1
1156      if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp   = 1
1157      if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk  = 0
1158      if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk  = 1000000
1159      return
1160    end if
1161    if ((dtset%optdriver/=RUNL_GSTATE.and.dtset%optdriver/=RUNL_RESPFN.and.dtset%optdriver/=RUNL_GWLS).or. &
1162 &   (dtset%optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then
1163      dtset%paral_kgb= 0
1164      dtset%npimage  = max(1,dtset%npimage)
1165      dtset%nppert   = max(1,dtset%nppert)
1166      dtset%npspinor = max(1,dtset%npspinor)
1167      dtset%npkpt    = max(1,dtset%npkpt)
1168      dtset%npfft    = max(1,dtset%npfft)
1169      dtset%npband   = max(1,dtset%npband)
1170      dtset%bandpp   = max(1,dtset%bandpp)
1171      return
1172    end if
1173  end if
1174 
1175  nprocmin=2
1176  if (xmpi_paral==1.and.dtset%paral_kgb>=0) nprocmin=max(2,nproc-100)
1177  if (max_ncpus > 0 .and. autoparal/=0) nprocmin = 1
1178 
1179 !Need the metric tensor
1180  call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd)
1181  call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
1182 
1183 !Determine some quantities related to plane waves
1184 !  - Crude estimation of the number of PW
1185 !  - Number of G vectors in each direction
1186  mpw=0;ngmin=0;ngmax=0
1187  if (optdriver==RUNL_GSTATE) then
1188    ecut_eff = dtset%ecut*dtset%dilatmx**2
1189    mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation
1190    if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1
1191    call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt)
1192    write(message,'(a,i8)') ' getmpw sequential formula gave: ',mpw
1193    call wrtout(std_out,message,'COLL')
1194  end if
1195 
1196  write(message,'(2a,i0)')  ch10,&
1197 & ' Computing all possible proc distrib for this input with nproc less than ',nproc
1198  call wrtout(std_out,message,'COLL')
1199 
1200 !Parallelization over images
1201  npi_min=1;npi_max=1;nimage_eff=1
1202  if (optdriver==RUNL_GSTATE) then
1203    nimage_eff=dtset%ndynimage
1204    if (dtset%ntimimage<=1) nimage_eff=dtset%nimage
1205    npi_min=max(1,dtset%npimage)
1206    npi_max=min(nproc,nimage_eff)
1207    if (tread(2)==1) npi_max=dtset%npimage
1208  end if
1209 
1210 !Parallelization over k-points and spin components (GS)
1211  npk_min=1;npk_max=1;nkpt_eff=0
1212  if (optdriver==RUNL_GSTATE) then
1213    nkpt_eff=dtset%nkpt*dtset%nsppol
1214    npk_min=max(1,dtset%npkpt)
1215    npk_max=min(nproc,nkpt_eff)
1216    if (tread(4)==1) npk_max=dtset%npkpt
1217  end if
1218 
1219 !Parallelization over perturbations, k-points and spin components (DFPT)
1220  npp_min=1;npp_max=1;npert_eff=1
1221  if (optdriver==RUNL_RESPFN) then
1222    if (dtset%paral_rf==1) then
1223      call get_npert_rbz(dtset,nband_rbz,nkpt_rbz,npert_eff)
1224      do jj=1,npert_eff
1225        ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj))
1226        nkpt_eff=max(nkpt_eff,ii)
1227      end do
1228      npp_min=max(1,dtset%nppert)
1229      npp_max=min(nproc,npert_eff)
1230      if (tread(3)==1) then
1231        npp_max=dtset%nppert
1232        if (npp_max>npert_eff) then
1233          npp_min=npert_eff;npp_max=npert_eff
1234          message='nppert is bigger than npert; we set nppert=npert'
1235          MSG_WARNING(message)
1236        end if
1237      end if
1238      npk_min=1
1239      npk_max=min(nproc,nkpt_eff)
1240      ABI_DEALLOCATE(nkpt_rbz)
1241      ABI_DEALLOCATE(nband_rbz)
1242    else
1243      nkpt_eff=nproc
1244      npk_min=nproc-5
1245      npk_max=nproc
1246    end if
1247  end if
1248 
1249 !Parallelization over spinorial components
1250  nps_min=1;nps_max=1
1251  if (optdriver==RUNL_GSTATE) then
1252    nps_min=max(1,dtset%npspinor)
1253    nps_max=min(nproc,dtset%nspinor)
1254    if (tread(5)==1) nps_max=dtset%npspinor
1255  end if
1256 
1257 !KGB Parallelization
1258 
1259 !>> FFT level
1260  npf_min=1;npf_max=1
1261  npb_min=1;npb_max=1
1262  bpp_min=1;bpp_max=1
1263  n2=0;n3=0
1264  if (dtset%optdriver==RUNL_GSTATE) then
1265    npf_min=max(1,dtset%npfft)
1266    npf_min=min(npf_min,ngmin(2))
1267    npf_max=min(nproc,NPFMAX)
1268    if (tread(6)==1) then
1269      npf_max=dtset%npfft
1270      if (npf_max>ngmin(2)) then
1271        write(message,'(3a)') &
1272 &       "Value of npfft given in input file is too high for the FFT grid!",ch10,&
1273 &       "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)."
1274        MSG_ERROR(message)
1275      end if
1276    end if
1277    npf_max=min(npf_max,ngmin(2))
1278    if (dtset%use_gpu_cuda==1) then
1279      npf_min=1;npf_max=1
1280    end if
1281 
1282 !  Number of FFT procs has to be a multiple of FFT grid sizes
1283 !  In case of a restart from a density file, it has to be
1284 !  compatible with the FFT grid used for the density
1285    n2=dtset%ngfft(2) ; n3=dtset%ngfft(3)
1286    if (n2==0.and.n3==0.and.(dtset%getden/=0.or.dtset%irdden/=0.or.dtset%iscf<0)) then
1287      dtset_found=.false.;file_found=.false.
1288      !1-Try to find ngfft from previous dataset
1289      if (dtset%getden/=0) then
1290        do ii=1,ndtset_alloc
1291          jj=dtset%getden;if (jj<0) jj=dtset%jdtset+jj
1292          if (dtsets(ii)%jdtset==jj) then
1293            dtset_found=.true.
1294 !          n2=dtsets(ii)%nfftdg;n3=0
1295            n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3)
1296          end if
1297        end do
1298      end if
1299      !2-If not found, try to extract ngfft from density file
1300      if (.not.dtset_found) then
1301        !Retrieve file name
1302        suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN'
1303        ABI_ALLOCATE(jdtset_,(0:ndtset_alloc))
1304        jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset
1305        call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii)
1306        ABI_DEALLOCATE(jdtset_)
1307        !Retrieve ngfft from file header
1308        idum3=0
1309        if (mpi_enreg%me==0) then
1310          inquire(file=trim(filden),exist=file_found)
1311          if (file_found) then
1312            call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self)
1313            idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1
1314            call hdr_free(hdr0)
1315            MSG_WARNING("Cannot find filden"//filden)
1316          end if
1317        end if
1318        call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii)
1319        n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0)
1320      end if
1321    end if
1322 
1323 !  >> Band level
1324    npb_min=max(1,dtset%npband)
1325    npb_max=min(nproc,mband)
1326    if (tread(7)==1) npb_max=dtset%npband
1327 
1328 !  >> banddp level
1329    bpp_min=max(1,dtset%bandpp)
1330    bpp_max=max(4,nint(mband/10.)) ! reasonnable bandpp max
1331    if (tread(8)==1) bpp_max=dtset%bandpp
1332  end if
1333 
1334 !Disable KGB parallelisation in some cases:
1335 !  - no GS
1336 !  - paral_kgb=0 present in input file
1337 !  - nstep=0
1338 !  - Self-consistent DMFT
1339 !  - Hartree-Fock or hybrid calculation (for now on)
1340  if ( (optdriver/=RUNL_GSTATE) .or. (dtset%paral_kgb==0.and.tread(1)==1) .or. &
1341 & (dtset%nstep==0).or. (dtset%usedmft==1.and.dtset%nstep>1) .or. &
1342 & (dtset%usefock==1) ) then
1343    nps_min=1; nps_max=1
1344    npf_min=1; npf_max=1
1345    npb_min=1; npb_max=1
1346    bpp_min=1; bpp_max=1
1347  end if
1348 
1349 !Print title
1350  if (iam_master) then
1351    if (optdriver==RUNL_GSTATE) then
1352      write(message, '(8(a12,a1),a,8(i4,a4,i4,a1))' )  &
1353      'npimage','|','npkpt','|','npspinor','|','npfft','|','npband','|',' bandpp ' ,'|','nproc','|','weight','|', ch10, &
1354      npi_min,' -> ',npi_max,'|',npk_min,' -> ',npk_max,'|',nps_min,' -> ',nps_max,'|', &
1355      npf_min,' -> ',npf_max,'|',npb_min,' -> ',npb_max,'|',bpp_min,' -> ',bpp_max,'|', &
1356      nprocmin,' -> ',nproc,'|', 1 ,' -> ',nproc,'|'
1357    end if
1358    if (optdriver==RUNL_RESPFN) then
1359      write(message, '(4(a12,a1),a,4(i4,a4,i4,a1))' )  &
1360      'nppert','|','npkpt','|','nproc','|','weight','|', ch10, &
1361      npp_min,' -> ',npp_max,'|',      npk_min,' -> ',npk_max,'|', &
1362      nprocmin,' -> ',nproc,'|', 1 ,' -> ',nproc,'|'
1363    end if
1364    call wrtout(std_out,message,'COLL')
1365    if(max_ncpus>0) then
1366      call wrtout(ab_out,message,'COLL')
1367    end if
1368  end if
1369 
1370 !Allocate lists
1371  ABI_ALLOCATE(my_distp,(10,MAXCOUNT))
1372  ABI_ALLOCATE(weight,(MAXCOUNT))
1373  my_distp(1:7,:)=0;weight(:)=zero
1374  my_distp(8,:)=dtset%use_slk
1375  my_distp(9,:)=dtset%np_slk
1376  my_distp(10,:)=dtset%gpu_linalg_limit
1377  icount=0;imin=1
1378 
1379  npc_min=1;npc_max=1;ncell_eff=1
1380  if (optdriver==RUNL_GSTATE) then
1381    ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max
1382  end if
1383  if (optdriver==RUNL_RESPFN) then
1384    ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max
1385  end if
1386 
1387 !Loop over all possibilities
1388 !Computation of weight~"estimated acceleration"
1389  if (new_version) then
1390 
1391 !  ======= NEW VERSION ========
1392    do npc=npc_min,npc_max
1393      acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc)
1394 
1395      do npk=npk_min,npk_max
1396 !      -> for DFPT runs, impose that nsppol divide npk
1397        if (optdriver==RUNL_RESPFN.and.modulo(npk,dtset%nsppol)>0.and.npk>1) cycle
1398        acc_k=one;if (npk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,npk)
1399 
1400        do nps=nps_min,nps_max
1401          acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps)
1402 
1403          do npf=npf_min,npf_max
1404 !          -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok)
1405            if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle
1406 !          -> npf should be only divisible by 2, 3 or 5
1407            ii=npf
1408            do while (modulo(ii,2)==0)
1409              ii=ii/2
1410            end do
1411            do while (modulo(ii,3)==0)
1412              ii=ii/3
1413            end do
1414            do while (modulo(ii,5)==0)
1415              ii=ii/5
1416            end do
1417            if(ii/=1) cycle
1418 
1419            do npb=npb_min,npb_max
1420              nproc1=npc*npk*nps*npf*npb
1421              if (nproc1<nprocmin)     cycle
1422              if (nproc1>nproc)        cycle
1423              if (modulo(mband,npb)>0) cycle
1424 
1425 !            Base speedup
1426              acc_kgb_0=one;if (npb*npf>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf))
1427 
1428              if (npb*npf>4) then
1429 !              Promote npb=npf
1430                acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf))
1431 !              Promote npf<=20
1432                if (npf>20)then
1433                  acc_kgb_0=acc_kgb_0* &
1434 &                 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) &
1435 &                 /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2
1436                end if
1437              end if
1438 
1439              first_bpp=.true.
1440              do bpp=bpp_min,bpp_max
1441                if (modulo(mband/npb,bpp)>0) cycle
1442                if ((bpp>1).and.(modulo(bpp,2)>0)) cycle
1443                if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle
1444                if (npb*npf<=4.and.(.not.first_bpp)) cycle
1445                first_bpp=.false.
1446 
1447                acc_kgb=acc_kgb_0
1448 !              Promote bpp*npb>mband/3
1449                if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband))
1450 
1451 !              Resulting speedup
1452 !              weight0=acc_c*acc_k*acc_s*acc_kgb
1453                weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+npk+nps+(npf*npb))
1454 
1455 !              Store data
1456                icount=icount+1
1457                if (icount<=MAXCOUNT) then
1458                  my_distp(1:7,icount)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
1459                  weight(icount)=weight0
1460                  if (weight0<weight(imin)) imin=icount
1461                else
1462                  if (weight0>weight(imin)) then
1463                    my_distp(1:7,imin)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
1464                    weight(imin)=weight0
1465                    idum=minloc(weight);imin=idum(1)
1466                  end if
1467                end if
1468 
1469              end do ! bpp
1470            end do ! npb
1471          end do ! npf
1472        end do ! nps
1473      end do ! npk
1474    end do ! npc
1475  else
1476 
1477 !  ======= OLD VERSION ========
1478    do npc=npc_min,npc_max
1479      acc_c=one;if (npc>1) acc_c = 0.99_dp*ncell_eff/((ncell_eff+npc-1)/npc)
1480 
1481      do npk=npk_min,npk_max
1482        acc_k=one;if (npk>1) acc_k = 0.96_dp*nkpt_eff/((nkpt_eff+npk-1)/npk)
1483 
1484        do nps=nps_min,nps_max
1485          acc_s=one;if (nps>1) acc_s = 0.85_dp*dtset%nspinor/ ((dtset%nspinor+nps-1)/nps)
1486 
1487          do npf=npf_min,npf_max
1488 !          -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok)
1489            if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle
1490 !          -> npf should be only divisible by 2, 3, 5, 7 or 11
1491            npb=npf ! Note that here, npb is used as a temp var
1492            do while (modulo(npb,2)==0)
1493              npb=npb/2
1494            end do
1495            do while (modulo(npb,3)==0)
1496              npb=npb/3
1497            end do
1498            do while (modulo(npb,5)==0)
1499              npb=npb/5
1500            end do
1501            do while (modulo(npb,7)==0)
1502              npb=npb/7
1503            end do
1504            do while (modulo(npb,11)==0)
1505              npb=npb/11
1506            end do
1507            if(npb/=1) cycle
1508 
1509            do npb=npb_min,npb_max
1510              nproc1=npc*npk*nps*npf*npb
1511              if (nproc1<nprocmin) cycle
1512              if (nproc1>nproc) cycle
1513              if(modulo(mband,npb)>0) cycle
1514 
1515              do bpp=bpp_max,bpp_min,-1
1516                if(modulo(mband/npb,bpp)>0) cycle
1517                if((bpp>1).and.(modulo(bpp,2)>0)) cycle
1518                if (1.*npb*bpp >max(1.,mband/3.)) cycle
1519 
1520                acc_kgb=one
1521                if (npb*npf>4) then
1522                  acc_kgb=min((one*npf)/(one*npb),(one*npb)/(one*npf))  * &
1523                  (mpw/(mpw/(npb*npf)))*(one-(three*bpp*npb)/mband)
1524                else if (npb*npf >1) then
1525                  acc_kgb=(mpw*mband/(mband*mpw/(npb*npf)))*0.7_dp
1526                end if
1527 
1528 !              Weight average for efficiency and estimated acceleration
1529                weight0=(acc_c+acc_k+acc_s+acc_kgb)/(npc+npk+nps+(npf*npb))
1530                weight0=weight0*nproc1
1531 
1532 !              Store data
1533                icount=icount+1
1534                if (icount<=MAXCOUNT) then
1535                  my_distp(1:7,icount)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
1536                  weight(icount)=weight0
1537                  if (weight0<weight(imin)) imin=icount
1538                else
1539                  if (weight0>weight(imin)) then
1540                    my_distp(1:7,imin)=(/npc,npk,nps,npf,npb,bpp,nproc1/)
1541                    weight(imin)=weight0
1542                    idum=minloc(weight);imin=idum(1)
1543                  end if
1544                end if
1545 
1546              end do ! bpp
1547            end do ! npb
1548          end do ! npf
1549        end do ! nps
1550      end do ! npk
1551    end do ! npc
1552 
1553 !  New or old version
1554  end if
1555 
1556  mcount_eff=icount
1557  mcount=min(mcount_eff,MAXCOUNT)
1558 
1559  if (mcount==0) then
1560    write(message,'(a,i0,2a,i0,a)')  &
1561    'Your input dataset does not let Abinit find an appropriate process distribution with nproc=',nproc,ch10, &
1562    'Try to comment all the np* vars and set paral_kgb=',-nproc,' to have advices on process distribution.'
1563    MSG_WARNING(message)
1564 !  Override here the 0 default value changed in indefo1
1565    dtset%npimage  = max(1,dtset%npimage)
1566    dtset%nppert   = max(1,dtset%nppert)
1567    dtset%npkpt    = max(1,dtset%npkpt)
1568    dtset%npspinor = max(1,dtset%npspinor)
1569    dtset%npfft    = max(1,dtset%npfft)
1570    dtset%npband   = max(1,dtset%npband)
1571    dtset%bandpp   = max(1,dtset%bandpp)
1572    ABI_DEALLOCATE(my_distp)
1573    ABI_DEALLOCATE(weight)
1574    return
1575  end if
1576 
1577 !* HF or hybrid calculation: no use of the fonction "autoparal"
1578  if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then
1579    write(message,'(a,i5,2a,i6,a)')  &
1580    'Hartree-Fock or hybrid calculation : Your input dataset does not let Abinit find an appropriate process distribution.'
1581    MSG_WARNING(message)
1582 !  Override here the 0 default value changed in indefo1
1583    dtset%npimage  = max(1,dtset%npimage)
1584    dtset%npkpt    = max(1,dtset%npkpt)
1585    dtset%npspinor = max(1,dtset%npspinor)
1586    dtset%npfft    = max(1,dtset%npfft)
1587    dtset%npband   = max(1,dtset%npband)
1588    dtset%bandpp   = max(1,dtset%bandpp)
1589    ABI_DEALLOCATE(my_distp)
1590    ABI_DEALLOCATE(weight)
1591    return
1592  end if
1593 
1594 !Sort data by increasing weight
1595  ABI_ALLOCATE(isort,(mcount))
1596  isort=(/(ii,ii=1,mcount)/)
1597  call sort_dp(mcount,weight,isort,tol6)
1598 
1599  ncount=mcount;if (dtset%paral_kgb>=0) ncount=min(mcount,5)
1600  if (iam_master) then
1601    do jj=mcount,mcount-ncount+1,-1
1602      ii=isort(jj)
1603      if (optdriver==RUNL_GSTATE) then
1604        write(message, '(7(i12,a1),f11.2,a2)') &
1605 &       my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|', &
1606 &       my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),' |'
1607      end if
1608      if (optdriver==RUNL_RESPFN) then
1609        write(message, '(3(i12,a1),f11.2,a2)') &
1610 &       my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(7,ii),'|',weight(jj),' |'
1611      end if
1612      call wrtout(std_out,message,'COLL')
1613      if(max_ncpus>0) then
1614        call wrtout(ab_out,message,'COLL')
1615      end if
1616    end do
1617  end if
1618 
1619  if (max_ncpus>0.and.(mcount_eff>MAXCOUNT)) then
1620    write(message,'(a,i0,a,i0,a)') &
1621 &   ' Received max_ncpus ',max_ncpus,' possible choices for nproc; only the first ',MAXCOUNT,' ones are printed...'
1622    call wrtout(ab_out,message,'COLL')
1623    call wrtout(std_out,message,'COLL')
1624  end if
1625 
1626  !if (iam_master .and. dtset%paral_kgb<0) then
1627  if (iam_master .and. max_ncpus>0) then
1628    write(ount,'(2a)')ch10,"--- !Autoparal"
1629 
1630    if (optdriver==RUNL_GSTATE) then
1631      write(ount,"(a)")"#Autoparal section for GS calculations with paral_kgb"
1632    else if (optdriver==RUNL_RESPFN) then
1633      write(ount,"(a)")'#Autoparal section for DFPT calculations'
1634    else
1635      MSG_ERROR("Unsupported optdriver")
1636    end if
1637 
1638    write(ount,"(a)")   "info:"
1639    write(ount,"(a,i0)")"    autoparal: ",autoparal
1640    write(ount,"(a,i0)")"    paral_kgb: ",dtset%paral_kgb
1641    write(ount,"(a,i0)")"    max_ncpus: ",max_ncpus
1642    write(ount,"(a,i0)")"    nspinor: ",dtset%nspinor
1643    write(ount,"(a,i0)")"    nsppol: ",dtset%nsppol
1644    write(ount,"(a,i0)")"    nkpt: ",dtset%nkpt
1645    write(ount,"(a,i0)")"    mband: ",mband
1646 
1647    ! List of configurations.
1648    write(ount,"(a)")"configurations:"
1649 
1650    if (optdriver==RUNL_GSTATE) then
1651 
1652      do jj=mcount,mcount-ncount+1,-1
1653        ii=isort(jj)
1654        tot_ncpus = my_distp(7,ii)
1655        eff = weight(jj) / tot_ncpus
1656 
1657        write(ount,"(a,i0)")"    - tot_ncpus: ",tot_ncpus
1658        write(ount,"(a,i0)")"      mpi_ncpus: ",tot_ncpus
1659        !write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus !OMP not supported  (yet)
1660        write(ount,"(a,f12.9)")"      efficiency: ",eff
1661        !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
1662 
1663        ! list of variables to use.
1664        !'npimage','|','npkpt','|','npspinor','|','npfft','|','npband','|',' bandpp ' ,'|','nproc','|','weight','|'
1665        write(ount,"(a)"   )"      vars: {"
1666        write(ount,"(a,i0,a)")"            npimage: ",my_distp(1,ii),","
1667        write(ount,"(a,i0,a)")"            npkpt: ", my_distp(2,ii),","
1668        write(ount,"(a,i0,a)")"            npspinor: ",my_distp(3,ii),","
1669        write(ount,"(a,i0,a)")"            npfft: ", my_distp(4,ii),","
1670        write(ount,"(a,i0,a)")"            npband: ",my_distp(5,ii),","
1671        write(ount,"(a,i0,a)")"            bandpp: ",my_distp(6,ii),","
1672        write(ount,"(a)")   "            }"
1673      end do
1674 
1675    else if (optdriver==RUNL_RESPFN) then
1676 
1677      do jj=mcount,mcount-ncount+1,-1
1678        ii=isort(jj)
1679        tot_ncpus = my_distp(7,ii)
1680        eff = weight(jj) / tot_ncpus
1681 
1682        write(ount,"(a,i0)")"    - tot_ncpus: ",tot_ncpus
1683        write(ount,"(a,i0)")"      mpi_ncpus: ",tot_ncpus
1684        !write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus !OMP not supported  (yet)
1685        write(ount,"(a,f12.9)")"      efficiency: ",eff
1686        !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
1687        ! list of variables to use.
1688        !'nppert','|','npkpt','|','nproc','|','weight','|',
1689        write(ount,"(a)"   )"      vars: {"
1690        write(ount,"(a,i0,a)")"             nppert: ", my_distp(1,ii),","
1691        write(ount,"(a,i0,a)")"             npkpt: ", my_distp(2,ii),","
1692        write(ount,"(a)")   "            }"
1693      end do
1694 
1695    end if
1696    write(ount,'(a)')"..."
1697    iexit = iexit + 1
1698  end if
1699 
1700  icount=isort(mcount)
1701 
1702 !Refinement of the process distribution by mean of a LinAlg routines benchmarking
1703  if (optdriver==RUNL_GSTATE.and.autoparal/=1) then
1704    if (autoparal/=3) then
1705      if (autoparal==2) then
1706        write(message,'(5a,9(a10,a1))') ch10, &
1707 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
1708 &       ' Weights below are corrected according:',ch10,&
1709 &       'npimage','|','npkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
1710 &       'nproc'  ,'|','weight','|','new weight','|'
1711      else
1712        write(message,'(5a,11(a10,a1))') ch10, &
1713 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
1714 &       ' Weights below are corrected according:',ch10,&
1715 &       'npimage','|','npkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
1716 &       'nproc'  ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|'
1717      end if
1718      call wrtout(std_out,message,'COLL')
1719      if (max_ncpus > 0) then
1720        call wrtout(ab_out,message,'COLL')
1721      end if
1722    end if
1723    acc_k=zero
1724    ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1
1725    do jj=mcount,mcount-ncount+1,-1
1726      ii=isort(jj)
1727      npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii)
1728      if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then
1729        use_linalg_gpu=dtset%use_gpu_cuda
1730        call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu)
1731        if (autoparal/=2) then
1732          my_distp(9,ii)=np_slk
1733          if (np_slk>0) my_distp(8,ii)=1
1734 !        * gpu_linalg_limit:
1735 !        No use of GPU: huge value ~2  *vectsize*blocksize**2 tested
1736 !        Use of GPU:    tiny value ~0.5*vectsize*blocksize**2 tested
1737          my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf
1738          if (use_linalg_gpu==1) my_distp(10,ii)=my_distp(10,ii)/4
1739        end if
1740        if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed
1741 !      * Weight (corrected by 10% of the computed ratio)
1742        weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb)
1743        if (autoparal==2) then
1744          write(message, '(7(i10,a1),f9.2,a2,f9.5,a2)') &
1745 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
1746 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |'
1747        else if (autoparal==3) then
1748          write(message,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, &
1749 &         ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),&
1750 &         ' and use_linalg_gpu=',use_linalg_gpu
1751        else
1752          write(message, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') &
1753 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
1754 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',&
1755 &         my_distp(9,ii),'|',use_linalg_gpu,'|'
1756        end if
1757        call wrtout(std_out,message,'COLL')
1758        if (max_ncpus>0) then
1759          call wrtout(ab_out,message,'COLL')
1760        end if
1761 !      We store the best value in weight(mcount) and keep icount
1762        if (weight0 > weight(mcount)) then
1763          icount=ii;weight(mcount)=weight0
1764        end if
1765      end if
1766    end do
1767  end if
1768 
1769 !Final advice in case max_ncpus > 0
1770  if (max_ncpus>0) then
1771    write(message,'(6a)') ch10,&
1772 &   ' Launch a parallel version of ABINIT with a number of processors among the above list,',ch10,&
1773 &   ' and the associated input variables npkpt, npband, npfft and bandpp. ',ch10,&
1774 &   ' The optimal weight is close to nproc and the higher should be better.'
1775    call wrtout(std_out,message,'COLL')
1776    call wrtout(ab_out,message,'COLL')
1777    iexit=iexit+1
1778    GOTO 100
1779  end if
1780 
1781 !Store new process distribution
1782  if (dtset%paral_kgb>=0) then
1783    nproc1=my_distp(7,icount)
1784 !  Work load distribution
1785    if (optdriver==RUNL_GSTATE) then
1786      dtset%npimage= my_distp(1,icount)
1787      dtset%nppert = 1
1788      dtset%npkpt  = my_distp(2,icount)
1789    end if
1790    if (optdriver==RUNL_RESPFN) then
1791      dtset%npimage= 1
1792      dtset%nppert = my_distp(1,icount)
1793      dtset%npkpt  = 1
1794    end if
1795    dtset%npspinor = my_distp(3,icount)
1796    dtset%npfft    = my_distp(4,icount)
1797    dtset%npband   = my_distp(5,icount)
1798    dtset%bandpp   = my_distp(6,icount)
1799 !  The following lines are mandatory : the DFT+DMFT must use ALL the
1800 !  available procs specified by the user. So nproc1=nproc.
1801 !  Works only if paral_kgb is not activated.
1802    if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE.and.dtset%paral_kgb==0) then
1803      dtset%npspinor = 1
1804      dtset%npfft    = 1
1805      dtset%npband   = 1
1806      dtset%bandpp   = 1
1807      dtset%npimage  = 1
1808      nproc1         = nproc
1809    end if
1810    if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1
1811 !  LinAlg parameters: we change values only if they are not present in input file
1812    if (dtset%paral_kgb==1) then
1813      if (tread(9)==0) dtset%use_slk=my_distp(8,icount)
1814      if (tread(10)==0) dtset%np_slk=my_distp(9,icount)
1815      if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount)
1816    end if
1817 !  New definition of "world" MPI communicator
1818    if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then
1819      nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90
1820      nproc1=min(nproc1,nproc)
1821      nproc1=(nproc1/dtset%nppert)*dtset%nppert
1822    end if
1823    call initmpi_world(mpi_enreg,nproc1)
1824  end if
1825 
1826  100 continue
1827 
1828  ABI_DEALLOCATE(isort)
1829  ABI_DEALLOCATE(my_distp)
1830  ABI_DEALLOCATE(weight)
1831 
1832  DBG_EXIT("COLL")
1833 
1834  contains
1835 
1836    function speedup_fdp(nn,mm)
1837    !Expected linear speedup for a nn-sized problem and mm processes
1838 
1839 !This section has been created automatically by the script Abilint (TD).
1840 !Do not modify the following lines by hand.
1841 #undef ABI_FUNC
1842 #define ABI_FUNC 'speedup_fdp'
1843 !End of the abilint section
1844 
1845    real(dp) :: speedup_fdp
1846    integer,intent(in) :: nn,mm
1847    speedup_fdp=(one*nn)/(one*((nn/mm)+merge(0,1,mod(nn,mm)==0)))
1848  end function speedup_fdp
1849 
1850 end subroutine finddistrproc

ABINIT/m_mpi_setup [ Modules ]


NAME

 m_mpi_setup

FUNCTION

  Initialize MPI parameters and data structures for parallel execution

COPYRIGHT

  Copyright (C) 1999-2018 ABINIT group (FJ, MT, FD)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

PARENTS

CHILDREN

SOURCE

21 #if defined HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include "abi_common.h"
26 
27 module m_mpi_setup
28 
29  use defs_basis
30  use defs_abitypes
31  use m_distribfft
32  use m_xmpi
33  use m_xomp
34  use m_hdr
35  use m_sort
36  use m_errors
37  use m_abicore
38 
39  use m_time,         only : abi_wtime
40  use m_parser,       only : intagm
41  use m_geometry,     only : mkrdim, metric
42  use m_fftcore,      only : fftalg_for_npfft, getng,  kpgcount
43  use m_mpinfo,       only : init_mpi_enreg, mpi_distrib_is_ok, initmpi_atom, proc_distrb_cycle, &
44                             initmpi_grid, initmpi_pert, initmpi_img, distrb2, distrb2_hf, initmpi_world
45  use m_libpaw_tools, only : libpaw_write_comm_set
46  use m_dtset,        only : get_npert_rbz
47  use m_kg,           only : getmpw
48  use m_dtfil,        only : mkfilename
49 
50  implicit none
51 
52  private

ABINIT/mpi_setup [ Functions ]


NAME

 mpi_setup

FUNCTION

 Big loop on the datasets:
 - compute mgfft, mpw, nfft, ... for this dataset;
 - fill mpi_enreg.
  *** At the output of this routine, all the dtsets input variables are known ***
 The content of dtsets should not be modified afterwards.
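
 Within the loop, each parallel input variable is read with intagm following the pattern
 below (a condensed sketch of the calls in the SOURCE; dprarr, intarr, jdtset, marr,
 string and lenstr are set up earlier in the routine):

   ! tread(1) is set to 1 only if the 'paral_kgb' keyword was found in the input string
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_kgb',tread(1),'INT')
   if (tread(1)==1) dtsets(idtset)%paral_kgb=intarr(1)
   ! ...the same pattern is repeated for npimage, nppert, npkpt, npspinor, npfft,
   ! npband, bandpp, use_slk, np_slk, gpu_linalg_limit, autoparal, max_ncpus, ...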

INPUTS

  filnam(5)=character strings giving file names
  ndtset= number of datasets to be read; if 0, no multi-dataset mode
  ndtset_alloc=number of datasets, corrected for allocation of at least
      one data set.

OUTPUT

  dtsets(0:ndtset_alloc)=<type dataset_type> contains all input variables,
   some of which are initialized here, while others were already
   initialized previously.

SIDE EFFECTS

   mpi_enregs=information about MPI parallelization

PARENTS

      abinit

CHILDREN

      abi_io_redirect,distrb2,distrb2_hf,finddistrproc,get_npert_rbz,getmpw
      getng,init_distribfft,init_mpi_enreg,initmpi_atom,initmpi_grid
      initmpi_img,initmpi_pert,intagm,libpaw_write_comm_set,metric,mkrdim
      wrtout

SOURCE

 97 subroutine mpi_setup(dtsets,filnam,lenstr,mpi_enregs,ndtset,ndtset_alloc,string)
 98 
 99 
100 !This section has been created automatically by the script Abilint (TD).
101 !Do not modify the following lines by hand.
102 #undef ABI_FUNC
103 #define ABI_FUNC 'mpi_setup'
104 !End of the abilint section
105 
106  implicit none
107 
108 !Arguments ------------------------------------
109 !scalars
110  integer,intent(in) :: lenstr,ndtset,ndtset_alloc
111  type(MPI_type),intent(inout) :: mpi_enregs(0:ndtset_alloc)
112  character(len=*),intent(inout) :: string
113 !arrays
114  character(len=fnlen),intent(in) :: filnam(5)
115  type(dataset_type),intent(inout) :: dtsets(0:ndtset_alloc)
116 
117 !Local variables -------------------------------
118 !scalars
119  integer :: blocksize,exchn2n3d,iband,idtset,iexit,ii,iikpt,iikpt_modulo
120  integer :: isppol,jdtset,marr,mband_lower,mband_upper
121  integer :: me_fft,mgfft,mgfftdg,mkmem,mpw,mpw_k,optdriver
122  integer :: nfft,nfftdg,nkpt,nkpt_me,npert,nproc,nproc_fft,nqpt
123  integer :: nspink,nsppol,nsym,paral_fft,response,tnband,tread0,usepaw,vectsize
124  integer :: fftalg,fftalga,fftalgc
125 #ifdef HAVE_LINALG_ELPA
126  integer :: icol,irow,np
127 #endif
128  logical :: fftalg_read,ortalg_read,wfoptalg_read,do_check
129  real(dp) :: dilatmx,ecut,ecut_eff,ecutdg_eff,ucvol
130  character(len=500) :: message
131 !arrays
132  integer :: ngfft(18),ngfftdg(18),ngfftc(3),tread(12)
133  integer,allocatable :: intarr(:),istwfk(:),symrel(:,:,:)
134  integer,pointer :: nkpt_rbz(:)
135  real(dp),parameter :: k0(3)=(/zero,zero,zero/)
136  real(dp) :: gmet(3,3),gprimd(3,3),kpt(3),qphon(3),rmet(3,3),rprimd(3,3)
137  real(dp),allocatable :: dprarr(:),kpt_with_shift(:,:)
138  real(dp),pointer :: nband_rbz(:,:)
139  character(len=6) :: nm_mkmem(3)
140 
141 !*************************************************************************
142 
143  DBG_ENTER("COLL")
144 
145  iexit=0;mpw_k=0
146 
147  call init_mpi_enreg(mpi_enregs(0))
148  call initmpi_img(dtsets(0),mpi_enregs(0),-1)
149 
150  do idtset=1,ndtset_alloc
151    call init_mpi_enreg(mpi_enregs(idtset))
152 
153    ! Handy read-only variables.
154    optdriver = dtsets(idtset)%optdriver
155 
156 !  Read parallel input parameters
157    marr=max(5,dtsets(idtset)%npsp,dtsets(idtset)%nimage)
158    ABI_ALLOCATE(intarr,(marr))
159    ABI_ALLOCATE(dprarr,(marr))
160    nkpt  =dtsets(idtset)%nkpt
161    nsppol=dtsets(idtset)%nsppol
162    jdtset=dtsets(idtset)%jdtset ; if(ndtset==0)jdtset=0
163    usepaw=dtsets(idtset)%usepaw
164    mband_upper=maxval(dtsets(idtset)%nband(1:nkpt*nsppol))
165    mband_lower=minval(dtsets(idtset)%nband(1:nkpt*nsppol))
166 
167 !  Compute metric for this dataset
168    call mkrdim(dtsets(idtset)%acell_orig(1:3,1),dtsets(idtset)%rprim_orig(1:3,1:3,1),rprimd)
169    call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
170 
171    ! Read paral_kgb and disable it if not supported in optdriver.
172    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_kgb',tread(1),'INT')
173    if (tread(1)==1) dtsets(idtset)%paral_kgb=intarr(1)
174 
175    if(xmpi_paral==0.and.dtsets(idtset)%paral_kgb==1)then
176      dtsets(idtset)%paral_kgb=0
177      write(message, '(5a)' ) &
178 &     'When ABINIT is compiled without MPI flag,',ch10,&
179 &     'setting paral_kgb/=0 is useless. paral_kgb has been reset to 0.',ch10,&
180 &     'Action : modify compilation option or paral_kgb in the input file.'
181      MSG_WARNING(message)
182    end if
183 
184    if ( ALL(optdriver /= [RUNL_GSTATE, RUNL_GWLS]) .and. dtsets(idtset)%paral_kgb/=0) then
185      dtsets(idtset)%paral_kgb=0
186      write(message, '(a,i0,a)') &
187 &     "paral_kgb != 0 is not available in optdriver ",optdriver,". Setting paral_kgb to 0"
188      MSG_COMMENT(message)
189    end if
190 
191    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'max_ncpus',tread0,'INT')
192    if (tread0==1) dtsets(idtset)%max_ncpus=intarr(1)
193 
194    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_atom',tread0,'INT')
195    if(tread0==1) dtsets(idtset)%paral_atom=intarr(1)
196 
197    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_rf',tread0,'INT')
198    if (tread0==1.and.any(optdriver==[RUNL_RESPFN, RUNL_NONLINEAR])) dtsets(idtset)%paral_rf=intarr(1)
199 
200    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npimage',tread(2),'INT')
201    if(tread(2)==1) dtsets(idtset)%npimage=intarr(1)
202 
203    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nppert',tread(3),'INT')
204    if (tread(3)==1.and.optdriver==RUNL_RESPFN) dtsets(idtset)%nppert=intarr(1)
205 
206    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npkpt',tread(4),'INT')
207    if(tread(4)==1) dtsets(idtset)%npkpt=intarr(1)
208 
209    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npspinor',tread(5),'INT')
210    if(tread(5)==1) dtsets(idtset)%npspinor=intarr(1)
211 
212    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npfft',tread(6),'INT')
213    if(tread(6)==1) dtsets(idtset)%npfft=intarr(1)
214 
215    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npband',tread(7),'INT')
216    if(tread(7)==1) dtsets(idtset)%npband=intarr(1)
217 
218    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'bandpp',tread(8),'INT')
219    if(tread(8)==1) dtsets(idtset)%bandpp=intarr(1)
220 
221    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'use_slk',tread(9),'INT')
222    if(tread(9)==1) dtsets(idtset)%use_slk=intarr(1)
223 
224    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_slk',tread(10),'INT')
225    if(tread(10)==1) dtsets(idtset)%np_slk=intarr(1)
226 
227    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'slk_rankpp',tread(12),'INT')
228    if(tread(12)==1) dtsets(idtset)%slk_rankpp=intarr(1)
229 
230    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pw_unbal_thresh',tread0,'DPR')
231    if(tread0==1) dtsets(idtset)%pw_unbal_thresh=dprarr(1)
232    mpi_enregs(idtset)%pw_unbal_thresh=dtsets(idtset)%pw_unbal_thresh
233 
234    call intagm(dprarr,intarr,jdtset,marr,5,string(1:lenstr),'gpu_devices',tread0,'INT')
235    if(tread0==1) dtsets(idtset)%gpu_devices(1:5)=intarr(1:5)
236 
237    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_linalg_limit',tread(11),'INT')
238    if(tread(11)==1) dtsets(idtset)%gpu_linalg_limit=intarr(1)
239 
240 
241    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nphf',tread0,'INT')
242    if(tread0==1) dtsets(idtset)%nphf=intarr(1)
243 
244    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'autoparal',tread0,'INT')
245    if(tread0==1) dtsets(idtset)%autoparal=intarr(1)
246 
247 
248    ! Dump the list of irreducible perturbations and exit.
249    if (dtsets(idtset)%paral_rf==-1.and.optdriver/=RUNL_NONLINEAR) then
250      call get_npert_rbz(dtsets(idtset),nband_rbz,nkpt_rbz,npert)
251      ABI_DEALLOCATE(nband_rbz)
252      ABI_DEALLOCATE(nkpt_rbz)
253      iexit = iexit + 1
254    end if
255 
256 !  From total number of procs, compute all possible distributions
257 !  Ignore exit flag if GW/EPH calculations because autoparal section is performed in screening/sigma/bethe_salpeter/eph
258    call finddistrproc(dtsets,filnam,idtset,iexit,mband_upper,mpi_enregs(idtset),ndtset_alloc,tread)
259    if (any(optdriver == [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_NONLINEAR])) iexit = 0
260 
261    if ((optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_GWLS).and. &
262 &   (dtsets(idtset)%npkpt/=1   .or.dtsets(idtset)%npband/=1.or.dtsets(idtset)%npfft/=1.or. &
263 &   dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1)) then
264 !&   .or.(dtsets(idtset)%iscf<0)) then
265      dtsets(idtset)%npkpt=1 ; dtsets(idtset)%npspinor=1 ; dtsets(idtset)%npfft=1
266      dtsets(idtset)%npband=1; dtsets(idtset)%bandpp=1  ; dtsets(idtset)%nphf=1
267      dtsets(idtset)%paral_kgb=0
268      message = 'For non ground state calculation, set bandpp, npfft, npband, npspinor npkpt and nphf to 1'
269      MSG_WARNING(message)
270    end if
271 
272 !  Read again some input data to take into account a possible change of paral_kgb
273    wfoptalg_read=.false.
274    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'wfoptalg',tread0,'INT')
275    if(tread0==1) then
276      dtsets(idtset)%wfoptalg=intarr(1)
277      wfoptalg_read=.true.
278    else
279      if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0
280      if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10
281      if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114
282      if (mod(dtsets(idtset)%wfoptalg,10)==4) then
283        do iikpt=1,dtsets(idtset)%nkpt
284          if (any(abs(dtsets(idtset)%kpt(:,iikpt))>tol8)) dtsets(idtset)%istwfk(iikpt)=1
285        end do
286      end if
287    end if
288 
289    dtsets(idtset)%densfor_pred=2
290    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'densfor_pred',tread0,'INT')
291    if(tread0==1) then
292      dtsets(idtset)%densfor_pred=intarr(1)
293    else
294      if (dtsets(idtset)%paral_kgb==1) dtsets(idtset)%densfor_pred=6
295    end if
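!   [Editorial note] Illustrative default (assumed input): if densfor_pred is absent from the
!   input file and paral_kgb==1, densfor_pred becomes 6; otherwise it keeps the default value 2
!   (and is later forced to 3 when ionmov==4 is combined with iscf==5 or 6).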
296    if((dtsets(idtset)%iscf==5.or.dtsets(idtset)%iscf==6) &
297 &   .and. dtsets(idtset)%ionmov==4 .and. dtsets(idtset)%densfor_pred/=3 )then
298      dtsets(idtset)%densfor_pred=3
299      write(message, '(a,a,a)' )&
300 &     'When ionmov==4 and iscf==5 or 6, densfor_pred must be 3.',ch10,&
301 &     'Set densfor_pred to 3.'
302      MSG_COMMENT(message)
303    end if
304 
305 #ifdef HAVE_LOTF
306 !  LOTF need densfor_pred=2
307    if(dtsets(idtset)%ionmov==23) dtsets(idtset)%densfor_pred=2
308 #endif
309 
310    if (usepaw==0) then
311      dtsets(idtset)%ortalg=2
312    else
313      dtsets(idtset)%ortalg=-2
314    end if
315    ortalg_read=.false.
316    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'ortalg',tread0,'INT')
317    if(tread0==1) then
318      dtsets(idtset)%ortalg=intarr(1)
319      ortalg_read=.true.
320    else if (dtsets(idtset)%wfoptalg>=10 .and. dtsets(idtset)%ortalg>0) then
321      dtsets(idtset)%ortalg=-dtsets(idtset)%ortalg
322    end if
323 
324    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'iomode',tread0,'INT')
325    if(tread0==1) then
326      dtsets(idtset)%iomode=intarr(1)
327    else
328      if ((xmpi_mpiio==1).and.(dtsets(idtset)%paral_kgb==1)) dtsets(idtset)%iomode=IO_MODE_MPI
329 #ifdef HAVE_NETCDF_DEFAULT
330      dtsets(idtset)%iomode=IO_MODE_ETSF
331 #endif
332    end if
333 
334    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pawmixdg',tread0,'INT')
335    if(tread0==1) then
336      dtsets(idtset)%pawmixdg=intarr(1)
337    else if (dtsets(idtset)%npfft>1.and.usepaw==1) then
338      dtsets(idtset)%pawmixdg=1
339    end if
340 
341    call initmpi_img(dtsets(idtset),mpi_enregs(idtset),-1)
342    nproc=mpi_enregs(idtset)%nproc_cell
343 
344 !  Cycle if the processor is not used
345    if (mpi_enregs(idtset)%me<0) then
346      ABI_DEALLOCATE(intarr)
347      ABI_DEALLOCATE(dprarr)
348      cycle
349    end if
350 
351    response=0
352    if (dtsets(idtset)%rfddk/=0 .or. dtsets(idtset)%rf2_dkdk/=0 .or. dtsets(idtset)%rf2_dkde/=0 .or. &
353 &   dtsets(idtset)%rfelfd/=0 .or. dtsets(idtset)%rfphon/=0 .or. dtsets(idtset)%rfstrs/=0 .or. &
354 &   dtsets(idtset)%rfuser/=0 .or. dtsets(idtset)%rfmagn/=0) response=1
355 
356 !  If no MPI, set all npxxx variables to 1
357    if (nproc==1) then
358      dtsets(idtset)%npkpt    = 1 ; dtsets(idtset)%npband   = 1
359      dtsets(idtset)%npfft    = 1 ; dtsets(idtset)%npspinor = 1
360      dtsets(idtset)%nphf     = 1
361    end if
362 
363 !    --If CUDA and recursion are used: only band parallelisation
364    if(dtsets(idtset)%tfkinfunc==2 .and. nproc/=1)then
365      dtsets(idtset)%npband = dtsets(idtset)%npband*dtsets(idtset)%npkpt*dtsets(idtset)%npspinor*dtsets(idtset)%npfft
366      dtsets(idtset)%npkpt = 1
367      dtsets(idtset)%npfft = 1
368      dtsets(idtset)%npspinor = 1
369      write(message, '(5a,i6,a)' )&
370 &     'If HAVE_GPU_CUDA and recursion are used ',ch10,&
371 &     'only the band parallelisation is active; we set:',ch10,&
372 &     'npfft= 1, npkpt= 1, npband=',dtsets(idtset)%npband,' .'
373      MSG_WARNING(message)
374    end if
375 
376    if (dtsets(idtset)%npspinor>=2.and.dtsets(idtset)%nspinor==1) then
377      dtsets(idtset)%npspinor=1
378      dtsets(idtset)%npfft=2*dtsets(idtset)%npfft
379      write(message,'(3a)')&
380 &     'npspinor is bigger than nspinor !',ch10,&
381 &     'We set npspinor to 1 ; we set npfft to 2*npfft'
382      MSG_WARNING(message)
383    end if
384 
385 !  Some checks on parallelization data
386    if(dtsets(idtset)%paral_kgb < 0 ) then
387      cycle
388    else if(dtsets(idtset)%paral_kgb/=0.and.(dtsets(idtset)%bandpp/=1.or.dtsets(idtset)%npband/=1.or.&
389 &     dtsets(idtset)%npfft/=1.or.dtsets(idtset)%npkpt/=1.or.dtsets(idtset)%npspinor/=1))then
390      if(dtsets(idtset)%npkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor > nproc )then
391        write(message,'(7a)')&
392 &       'The product of npkpt, npfft, npband and npspinor is bigger than the number of processors.',ch10,&
393 &       'The user-defined values of npkpt, npfft, npband or npspinor will be modified,',ch10,&
394 &       'in order to bring this product to nproc or below.',ch10,&
395 &       'At present, only a very simple algorithm is used ...'
396        MSG_WARNING(message)
397 
398        if(dtsets(idtset)%npkpt*dtsets(idtset)%npband*dtsets(idtset)%npspinor <= nproc) then
399          dtsets(idtset)%npfft=1
400          MSG_WARNING('Set npfft to 1')
401        else if(dtsets(idtset)%npkpt*dtsets(idtset)%npspinor <= nproc)then
402          dtsets(idtset)%npfft=1
403          dtsets(idtset)%npband=1
404          MSG_WARNING('Set npfft and npband to 1')
405        else if(dtsets(idtset)%npkpt <= nproc)then
406          dtsets(idtset)%npfft=1
407          dtsets(idtset)%npband=1
408          dtsets(idtset)%npspinor=1
409          MSG_WARNING('Set npfft, npband and npspinor to 1')
410        else
411          dtsets(idtset)%npfft=1
412          dtsets(idtset)%npband=1
413          dtsets(idtset)%npkpt=1
414          dtsets(idtset)%npspinor=1
415          MSG_WARNING('Set npfft, npband, npspinor and npkpt to 1')
416        end if
417      else if(dtsets(idtset)%npkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor < nproc)then
418        write(message,'(2a)')&
419 &       'The number of processors must not be greater than npfft*npband*npkpt*npspinor ',&
420 &       'when npfft or npkpt or npband or npspinor are chosen manually in the input file.'
421        MSG_ERROR(message)
422      end if
423    end if
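!   [Editorial note] Illustrative reduction cascade (values assumed): with nproc=4 and the
!   user-supplied npkpt=2, npband=2, npfft=2, npspinor=1, the product 2*2*2*1=8 exceeds nproc,
!   but npkpt*npband*npspinor=4 <= nproc, so only npfft is reset to 1 by the first branch above.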
424 
425 !  LOBPCG and ChebFi need paral_kgb=1 in parallel
426    if ((dtsets(idtset)%npband*dtsets(idtset)%npfft>1).and. &
427 &   (mod(dtsets(idtset)%wfoptalg,10)==1.or.mod(dtsets(idtset)%wfoptalg,10)==4)) then
428      dtsets(idtset)%paral_kgb=1
429    end if
430 
431 !  Check size of Scalapack communicator
432 #ifdef HAVE_LINALG_ELPA
433    if(dtsets(idtset)%paral_kgb>0.and.dtsets(idtset)%np_slk>0) then
434      np=min(dtsets(idtset)%np_slk,dtsets(idtset)%npband*dtsets(idtset)%npfft*dtsets(idtset)%npspinor)
435      irow=int(sqrt(float(np)))
436      do while(mod(np,irow)/=0)
437        irow=irow-1
438      end do
439      icol=nproc/irow
440      if (icol>mband_lower) then
441        do while(icol>mband_lower)
442          icol=icol-1
443          do while(mod(np,icol)/=0)
444            icol=icol-1
445          end do
446        end do
447        dtsets(idtset)%np_slk=icol
448        write(message,'(5a,i6,a)')&
449 &       'The number of band*fft*spinor processors was not consistent with',ch10,&
450 &       'the size of the communicator used for the ELPA library (np_slk).',ch10,&
451 &       'np_slk value has been adjusted to ',dtsets(idtset)%np_slk,'.'
452        MSG_COMMENT(message)
453      end if
454    end if
455 #endif
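!   [Editorial note] Sketch of the ELPA np_slk adjustment above, with assumed values:
!   np_slk=8, npband*npfft*npspinor=8, nproc=8 and mband_lower=3. Then np=min(8,8)=8,
!   irow=int(sqrt(8.))=2 (which divides 8) and icol=nproc/irow=4; since 4>mband_lower,
!   icol is decremented to 2, a divisor of np not exceeding mband_lower, and np_slk is set to 2.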
456 
457    !Additional check in case of a parallelized Hartree-Fock calculation
458    !   %usefock == option to perform Fock exchange calculation
459    !   %nphf   == number of processors for Fock exchange calculation
460    if ((dtsets(idtset)%usefock==1).and.(dtsets(idtset)%nphf/=1)) then
461 
462      if ((dtsets(idtset)%nphf<0).or.(dtsets(idtset)%nphf==0)) then
463        MSG_ERROR('The value of variable nphf should be a positive integer.')
464      end if
465      if (dtsets(idtset)%paral_kgb/=0) then
466        message = 'Option paral_kgb should be turned off (value 0) for a parallelized Hartree-Fock calculation.'
467        MSG_ERROR(message)
468      end if
469      if (response/=0) then
470        message = 'A response function calculation is not yet possible with a parallelized Hartree-Fock calculation.'
471        MSG_ERROR(message)
472      end if
473      if (dtsets(idtset)%npspinor>1) then
474        message = 'The parallelism on spinors is not supported by a parallelized Hartree-Fock calculation.'
475        MSG_ERROR(message)
476      end if
477      if (dtsets(idtset)%npkpt*dtsets(idtset)%nphf > nproc )then
478        write(message,'(a,3(a,i0))') ch10,&
479 &       'The product of variables npkpt and nphf is bigger than the number of processors: npkpt= ',&
480 &       dtsets(idtset)%npkpt,' nphf= ',dtsets(idtset)%nphf  ,' and nproc= ', nproc
481        MSG_ERROR(message)
482      end if
483    end if ! Fock
484 
485    !When using chebfi, the number of blocks is equal to the number of processors
486    if((dtsets(idtset)%wfoptalg == 1)) then
487      !Nband might have different values for different k-points, but bandpp cannot.
488      !In this case, we just use the largest nband, and the input will probably fail
489      !at the bandpp check later on.
490      dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband
491    end if
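!   [Editorial note] Example of the Chebyshev-filtering block size (assumed values):
!   with mband_upper=64 and npband=8, bandpp=64/8=8 (integer division). If mband_upper
!   were not divisible by npband, the resulting bandpp would be caught by the later checks.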
492 
493 !  Set mpi_enreg
494    mpi_enregs(idtset)%paral_kgb=dtsets(idtset)%paral_kgb
495    if(dtsets(idtset)%paral_kgb/=0)then
496      mpi_enregs(idtset)%nproc_kpt=dtsets(idtset)%npkpt
497      mpi_enregs(idtset)%nproc_fft=dtsets(idtset)%npfft
498      mpi_enregs(idtset)%nproc_band=dtsets(idtset)%npband
499      mpi_enregs(idtset)%nproc_spinor=min(dtsets(idtset)%npspinor,dtsets(idtset)%nspinor)
500      mpi_enregs(idtset)%bandpp=dtsets(idtset)%bandpp
501 !    Additional setting in case of hybrid functional calculation => not yet tested (CMartins)
502 !     if (dtsets(idtset)%usefock==1) then
503 !       mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf
504 !       if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1
505 !     end if
506    else
507      mpi_enregs(idtset)%bandpp = dtsets(idtset)%bandpp
508 !    Additional setting in case of a Fock exchange or PBE0 calculation
509      if (dtsets(idtset)%usefock==1) then
510        if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1
511        mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf
512        if (dtsets(idtset)%npkpt/=1) then
513          mpi_enregs(idtset)%nproc_kpt = dtsets(idtset)%npkpt
514        else
515          mpi_enregs(idtset)%nproc_kpt = mpi_enregs(idtset)%nproc_cell/mpi_enregs(idtset)%nproc_hf
516        end if
517      else
518        mpi_enregs(idtset)%nproc_kpt = mpi_enregs(idtset)%nproc_cell
519      end if
520    end if
521 
522    if(dtsets(idtset)%paral_kgb>=0) then
523 
524 !    Compute processor distribution over perturbations
525      mpi_enregs(idtset)%paral_pert=dtsets(idtset)%paral_rf
526      if (mpi_enregs(idtset)%paral_pert==1) then
527        dtsets(idtset)%nppert=max(1,dtsets(idtset)%nppert)
528        if(dtsets(idtset)%nppert>mpi_enregs(idtset)%nproc) then
529          message=' The number of processors must not be smaller than nppert !'
530          MSG_ERROR(message)
531        end if
532        call initmpi_pert(dtsets(idtset),mpi_enregs(idtset))
533        mpi_enregs(idtset)%nproc_kpt = mpi_enregs(idtset)%nproc_cell
534        nproc=mpi_enregs(idtset)%nproc_cell
535      end if
536 !    Cycle if the processor is not used
537      if (mpi_enregs(idtset)%me<0) then
538        ABI_DEALLOCATE(intarr)
539        ABI_DEALLOCATE(dprarr)
540        cycle
541      end if
542 
543 !    Compute processor distribution over kpt (and possibly band-FFT)
544      call initmpi_grid(mpi_enregs(idtset))
545      if(dtsets(idtset)%usewvl==1) mpi_enregs(idtset)%comm_fft=mpi_enregs(idtset)%comm_cell
546 
547 !    Initialize tabs used for k/spin parallelism (with sequential-type values)
548      ABI_ALLOCATE(mpi_enregs(idtset)%proc_distrb,(nkpt,mband_upper,nsppol))
549      ABI_ALLOCATE(mpi_enregs(idtset)%my_kpttab,(nkpt))
550      mpi_enregs(idtset)%proc_distrb(:,:,:)=0
551      mpi_enregs(idtset)%my_kpttab(:)=(/(ii,ii=1,nkpt)/)
552      mpi_enregs(idtset)%my_isppoltab(:)=1;if (dtsets(idtset)%nsppol==1) mpi_enregs(idtset)%my_isppoltab(2)=0
553 
554 !    HF or hybrid calculation : initialization of the array distrb_hf
555      if (dtsets(idtset)%usefock==1) then
556        ABI_ALLOCATE(mpi_enregs(idtset)%distrb_hf,(dtsets(idtset)%nkpthf,dtsets(idtset)%nbandhf,1))
557 !      The dimensions of distrb_hf are given by %nkpthf and %nbandhf.
558 !      We assume that there is no dependence on spin polarization for the occupied states.
559        mpi_enregs(idtset)%distrb_hf=0
560      end if
561 
562 !    Define k-points distribution (determine who I am)
563 !    Note that nkpt_me may differ from processor to processor
564 !    This fact will NOT be taken into account when
565 !    the memory needs are evaluated in the subroutine memory.
566 !    Also, the reduction of k points due to symmetry in RF calculations
567 !    is NOT taken into account. This should be changed later ...
568      nkpt_me=nkpt
569      if(xmpi_paral==1 .and. dtsets(idtset)%usewvl == 0) then
570        nkpt_me=0
571        if(response==0 .or. (response==1 .and. dtsets(idtset)%efmas==1))then
572          mpi_enregs(idtset)%paralbd=0
573          call distrb2(mband_upper,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
574          do iikpt=1,nkpt
575            if(.not.(proc_distrb_cycle(mpi_enregs(idtset)%proc_distrb,iikpt,1,1,-1,mpi_enregs(idtset)%me_kpt)))&
576 &           nkpt_me=nkpt_me+1
577          end do ! ikpt=1,nkpt
578 !        HF or hybrid calculation : define the occupied states distribution (in array distrb_hf)
579          if (dtsets(idtset)%usefock==1) then
580            call distrb2_hf(dtsets(idtset)%nbandhf,dtsets(idtset)%nkpthf,nproc,nsppol,mpi_enregs(idtset))
581          end if
582        else ! response==1
583 !        This wrongly assumes that the number of elements of the
584 !        k-point sets of the two spin polarizations is the maximal
585 !        value over these k-point sets ...
586 !        This is to be corrected when RF is implemented
587 !        for the spin-polarized case.
588          mpi_enregs(idtset)%paralbd=1
589 !        nproc=mpi_enregs(idtset)%nproc_cell*mpi_enregs(idtset)%nproc_pert
590          call distrb2(mband_upper,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
591          do isppol=1,nsppol
592            nspink=0
593            do iikpt=1,nkpt
594              do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt)
595                if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_cell)then
596                  nspink=nspink+1
597                  exit
598                end if
599              end do ! iband
600            end do ! iikpt
601            if(nspink>nkpt_me)nkpt_me=nspink
602          end do ! isppol
603 !        Is nband present in input file or automatically estimated ?
604          tnband=0
605          call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nband',tnband,'INT')
606 !        If the number of bands was estimated, there might be a side effect
607 !        when the definitive number of bands is known. k points
608 !        might be attributed to different processors than the present
609 !        proc_distrb describes. At most, the number of k points could increase by 1 ...
610          if(tnband==0)nkpt_me=nkpt_me+1
611 !        In any case, the maximal number of k points is nkpt
612          if(nkpt_me>nkpt)nkpt_me=nkpt
613        end if
614      end if
615    end if
616 
617 !  Take care of mkmems. Use the generic name -mkmem- for mkmem as well as mkqmem
618 !  and mk1mem.
619    nm_mkmem(1)='mkmem '
620    nm_mkmem(2)='mkqmem'
621    nm_mkmem(3)='mk1mem'
622 
623    do ii=1,3
624 
625 !    Read in mkmem here if it is in the input file
626      if(ii==1)then
627        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkmem',tread0,'INT')
628      else if(ii==2)then
629        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkqmem',tread0,'INT')
630      else if(ii==3)then
631        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mk1mem',tread0,'INT')
632      end if
633 
634 !    Note that mkmem is used as a dummy variable, representing mkmem as well
635 !    as mkqmem, and mk1mem.
636      if(tread0==1) then
637        mkmem=intarr(1)
638        if (mkmem<0) then
639 !        mkmem is unreasonable; must be zero or positive
640          write(message, '(4a,i0,4a)')&
641 &         nm_mkmem(ii),' must be zero or positive but ',nm_mkmem(ii),' =',mkmem,ch10,&
642 &         'Use default ',nm_mkmem(ii),' = nkpt .'
643          MSG_WARNING(message)
644          mkmem=nkpt
645        end if
646 
647      else
648 
649 !      mkmem was not set in the input file so default to incore solution
650        write(message,'(6a)') &
651 &       'mpi_setup: ',nm_mkmem(ii),' undefined in the input file. ','Use default ',nm_mkmem(ii),' = nkpt'
652        call wrtout(std_out,message,'COLL')
653        mkmem=nkpt
654      end if
655 
656 !    Check whether nkpt distributed on the processors <= mkmem;
657 !    if so then may run entirely in core,
658 !    avoiding i/o to disk for wavefunctions and kg data.
659 !    mkmem/=0 to avoid i/o; mkmem==0 to use disk i/o for nkpt>=1.
660      if (nkpt_me<=mkmem .and. mkmem/=0 ) then
661        write(message, '(a,i0,a,a,a,i0,a)' ) &
662 &       ' mpi_setup: With nkpt_me=',nkpt_me,' and ',nm_mkmem(ii),' = ',mkmem,', ground state wf handled in core.'
663        call wrtout(std_out,message,'COLL')
664        if(nkpt_me<mkmem .and. nkpt_me/=0)then
665          write(message,'(3a)')' Resetting ',nm_mkmem(ii),' to nkpt_me to save memory space.'
666          mkmem=nkpt_me
667          call wrtout(std_out,message,'COLL')
668        end if
669      else if(mkmem/=0)then
670        write(message, '(a,i0,3a,i0,5a)' ) &
671 &       ' mpi_setup: With nkpt_me=',nkpt_me,' and ',nm_mkmem(ii),' = ',mkmem,&
672 &       ' ground state wfs require disk i/o.',ch10,&
673 &       ' Resetting ',nm_mkmem(ii),' to zero to save memory space.'
674        mkmem=0
675        call wrtout(std_out,message,'COLL')
676      end if
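!    [Editorial note] Illustrative mkmem decision (assumed values): with nkpt_me=4 and
!    mkmem=10, the wavefunctions fit in core and mkmem is reset to 4 to save memory;
!    with nkpt_me=12 and mkmem=10, disk i/o would be needed and mkmem is reset to 0.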
677      if(dtsets(idtset)%usewvl == 0 .or. dtsets(idtset)%usepaw==1)then
678        if(ii==1)dtsets(idtset)%mkmem=mkmem
679      end if
680      if(ii==2)dtsets(idtset)%mkqmem=mkmem
681      if(ii==3)dtsets(idtset)%mk1mem=mkmem
682 
683      if(dtsets(idtset)%usewvl == 1 .and. dtsets(idtset)%usepaw==1 )then
684        if(dtsets(idtset)%mkmem .ne. dtsets(idtset)%nkpt) then
685          MSG_ERROR("mkmem is not allowed for WVL+PAW")
686        end if
687      end if
688 
689    end do  ! End the loop on the three possibilities mkmem, mkqmem, mk1mem.
690 
691    if(dtsets(idtset)%paral_kgb==1) mpi_enregs(idtset)%paralbd=0
692 
693 !  Check if some MPI processes are empty (the MBPT code uses a completely different MPI algorithm)
694    do_check = all(optdriver /= [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE])
695    if (dtsets(idtset)%usewvl == 0 .and. do_check) then
696      if (.not.mpi_distrib_is_ok(mpi_enregs(idtset),mband_upper,&
697 &     dtsets(idtset)%nkpt,dtsets(idtset)%mkmem,nsppol,msg=message)) then
698        write(message,'(5a)') trim(message),ch10,&
699 &       'YOU ARE STRONGLY ADVISED TO ACTIVATE AUTOMATIC PARALLELIZATION!',ch10,&
700 &       'PUT "AUTOPARAL=1" IN THE INPUT FILE.'
701        MSG_WARNING(message)
702      end if
703    end if
704 
705 !  call mpi_setup1(dtsets(idtset),jdtset,lenstr,mband_upper,mpi_enregs(idtset),string)
706 !  Printing of processor distribution
707 !  MPIWF : here, set up the complete ngfft, containing the information
708 !  for the parallelisation of the FFT
709    call abi_io_redirect(new_io_comm=mpi_enregs(idtset)%comm_world)
710    call libpaw_write_comm_set(mpi_enregs(idtset)%comm_world)
711 
712 !  Default values for sequential case
713    paral_fft=0; nproc_fft=1; me_fft=0
714 
715    if(dtsets(idtset)%usewvl == 0)then
716      if(optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then
717        paral_fft=1           ! parallelisation over FFT
718        if (mpi_enregs(idtset)%nproc_cell>0) then
719          if(mpi_enregs(idtset)%paral_kgb == 1) then
720 
721            if((dtsets(idtset)%use_gpu_cuda==1).and.(mpi_enregs(idtset)%nproc_fft/=1))then
722              write(message,'(3a,i0)') &
723 &             'When use_gpu_cuda is on, the number of FFT processors, npfft, must be 1',ch10,&
724 &             'However, npfft=',mpi_enregs(idtset)%nproc_fft
725              MSG_ERROR(message)
726            end if
727 
728            if(modulo(dtsets(idtset)%ngfft(2),mpi_enregs(idtset)%nproc_fft)/=0)then
729              write(message,'(3a,i0,a,i0)') &
730 &             'The second FFT dimension, ngfft(2), should be a multiple of the number of FFT processors, npfft.',ch10,&
731 &             'However, npfft=',mpi_enregs(idtset)%nproc_fft,' and ngfft(2)=',dtsets(idtset)%ngfft(2)
732              MSG_BUG(message)
733            end if
734 
735            do iikpt=1,nkpt*nsppol
736              iikpt_modulo = modulo(iikpt,nkpt)+1
737              if ((dtsets(idtset)%istwfk(iikpt_modulo)==2)) then !.and.(dtsets(idtset)%ngfft(7)==401)) then
738                if ((mpi_enregs(idtset)%bandpp==0).or. &
739                ((mpi_enregs(idtset)%bandpp/=1).and.(modulo(mpi_enregs(idtset)%bandpp,2)/=0))) then
740                  write(message,'(3a,i0)') &
741 &                 'The value of bandpp should be 1 or a multiple of 2.',ch10,&
742 &                 'However, bandpp=',mpi_enregs(idtset)%bandpp
743                  MSG_BUG(message)
744                end if
745                if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
746                  write(message,'(3a,i0,a,i0)') &
747 &                 'The number of bands for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,&
748 &                 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
749 &                 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
750                  MSG_BUG(message)
751                end if
752              else if ((dtsets(idtset)%istwfk(iikpt_modulo)==2) .and. (dtsets(idtset)%ngfft(7)==400)) then
753                MSG_BUG('The fftalg=400 with istwfk=2 is not valid')
754              else
755                if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
756                  write(message,'(3a,i0,a,i0)') &
757 &                 'The number of bands for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,&
758 &                 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
759 &                 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
760                  MSG_BUG(message)
761                end if
762                if ((mpi_enregs(idtset)%bandpp==0)) then
763                  write(message,'(a,i0,2a,i0,2a,i0)')&
764 &                 'The value of bandpp should not be 0 with fftalg=',dtsets(idtset)%ngfft(7),ch10,&
765 &                 'and istwfk=',dtsets(idtset)%istwfk(iikpt_modulo),ch10,&
766 &                 'However, bandpp=',mpi_enregs(idtset)%bandpp
767                  MSG_BUG(message)
768                end if
769              end if
770            end do
771 
772            if (xmpi_paral==1) then
773              if(modulo(nkpt*nsppol,mpi_enregs(idtset)%nproc_kpt)/=0)then
774                write(message,'(3a,i0,a,i0)') &
775 &               'The product nkpt*nsppol should be a multiple of the number of KPT processors, npkpt.',ch10,&
776 &               'However, npkpt=',mpi_enregs(idtset)%nproc_kpt,' and nkpt*nsppol=',nkpt*nsppol
777                MSG_WARNING(message)
778              end if
779            end if
780          else
781            do iikpt=1,nkpt*nsppol
782              iikpt_modulo = modulo(iikpt,nkpt)+1
783              if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
784                write(message,'(3a,i0,a,i0)') &
785 &               'The number of bands for the k-point, nband_k, should be a multiple of npband*bandpp.',ch10,&
786 &               'However, nband_k=',dtsets(idtset)%nband(iikpt),' and npband*bandpp=', &
787 &               mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
788                MSG_BUG(message)
789              end if
790            end do
791          end if
792        end if
793        nproc_fft=mpi_enregs(idtset)%nproc_fft
794        me_fft=mpi_enregs(idtset)%me_fft
795      end if
796    end if
797 
798 !  Compute mgfft, mpw and nfft for this data set (they depend on mpi_enreg)
799    ABI_ALLOCATE(istwfk,(nkpt))
800    ABI_ALLOCATE(kpt_with_shift,(3,nkpt))
801 
802    ! Set the default value of fftalg for given npfft but allow the user to override it.
803    ! Warning: If you need to change npfft, **DO IT** before this point so that here we get the correct fftalg
804    dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
805    dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
806 
807    fftalg_read=.false.
808    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'fftalg',tread0,'INT')
809 
810    if (tread0==1) then
811      dtsets(idtset)%ngfft(7)=intarr(1)
812      if (usepaw==1) dtsets(idtset)%ngfftdg(7)=intarr(1)
813      fftalg_read=.true.
814    end if
815 
816    ecut     =dtsets(idtset)%ecut
817    dilatmx  =dtsets(idtset)%dilatmx
818    ngfft(:) =dtsets(idtset)%ngfft(:)
819    istwfk(:)=dtsets(idtset)%istwfk(1:nkpt)
820    nsym     =dtsets(idtset)%nsym
821 
822    nqpt=dtsets(idtset)%nqpt
823    qphon(:)=zero;if(nqpt/=0) qphon(:)=dtsets(idtset)%qptn(:)
824 
825    ABI_ALLOCATE(symrel,(3,3,nsym))
826    symrel(:,:,1:nsym)=dtsets(idtset)%symrel(:,:,1:nsym)
827    ecut_eff=ecut*dilatmx**2
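!   [Editorial note] Example of the effective cutoff (assumed values): ecut=20 Ha and
!   dilatmx=1.05 give ecut_eff = 20*1.05**2 = 22.05 Ha, the cutoff passed to getng/getmpw below.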
828 
829    if (usepaw==1) then
830      call wrtout(std_out,'getng is called for the coarse grid:','COLL')
831    end if
832    kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary
833 
834    call getng(dtsets(idtset)%boxcutmin,ecut_eff,gmet,kpt,me_fft,mgfft,nfft,&
835 &   ngfft,nproc_fft,nsym,paral_fft,symrel,&
836 &   use_gpu_cuda=dtsets(idtset)%use_gpu_cuda)
837 
838    dtsets(idtset)%ngfft(:)=ngfft(:)
839    dtsets(idtset)%mgfft=mgfft
840    dtsets(idtset)%nfft=nfft
841    kpt_with_shift(:,:)=dtsets(idtset)%kpt(:,1:nkpt)/dtsets(idtset)%kptnrm
842 
843    exchn2n3d=dtsets(idtset)%exchn2n3d
844    nproc_fft=ngfft(10) ; me_fft=ngfft(11)
845    fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
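!   [Editorial note] Decoding example (assumed value): fftalg=401 splits into
!   fftalga=401/100=4 (the FFT implementation family) and fftalgc=mod(401,10)=1 (the variant).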
846 
847    ! Initialize tables for MPI-FFT.
848    call init_distribfft(mpi_enregs(idtset)%distribfft,'c',mpi_enregs(idtset)%nproc_fft,ngfft(2),ngfft(3))
849 
850    if(response/=0)then
851 !    This value of mpw is used in the first part of respfn.f
852      call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw_k,nkpt)
853    end if
854    if(nqpt/=0)then
855      kpt_with_shift(1,:)=kpt_with_shift(1,:)+qphon(1)
856      kpt_with_shift(2,:)=kpt_with_shift(2,:)+qphon(2)
857      kpt_with_shift(3,:)=kpt_with_shift(3,:)+qphon(3)
858    end if
859    if (dtsets(idtset)%usewvl == 0) then
860      call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw,nkpt)
861      ! Allocate tables for parallel IO of the wavefunctions.
862      if( xmpi_mpiio==1 .and. mpi_enregs(idtset)%paral_kgb == 1 .and. &
863 &     any(dtsets(idtset)%iomode == [IO_MODE_MPI, IO_MODE_ETSF])) then
864        ABI_ALLOCATE(mpi_enregs(idtset)%my_kgtab,(mpw,dtsets(idtset)%mkmem))
865      end if
866    else
867      mpw = 0
868    end if
869 
870 !  The dimensioning, in the RF case, should be done only with mpw,
871 !  but mpw is used in the first part of respfn.f, and should at least
872 !  be equal to mpw_k. The chosen way to code is not optimal, only convenient:
873 !  it leads to a small waste of memory.
874    if(response/=0 .and. mpw_k>mpw)mpw=mpw_k
875    dtsets(idtset)%ngfft(:)=ngfft(:)
876 
877 !  Initialize ngfftc to the initial guess for the coarse mesh
878    ngfftc(:) = 2
879 
880 !  In case of PAW, compute fine FFT parameters
881    if (usepaw==1) then
882      ecutdg_eff=dtsets(idtset)%pawecutdg*dtsets(idtset)%dilatmx**2
883      ngfftdg(:)=dtsets(idtset)%ngfftdg(:)
884      call wrtout(std_out,'getng is called for the fine grid:','COLL')
885 !    Start with the coarse mesh as an initial guess for the fine mesh
886 !    This ensures that the fine mesh will not be any coarser than the coarse mesh in each dimension
887      ngfftc(:) = ngfft(1:3)
888      kpt=k0; if (response==1.and.usepaw==1) kpt=qphon  ! this is temporary
889 
890      call getng(dtsets(idtset)%bxctmindg,ecutdg_eff,gmet,kpt,me_fft,mgfftdg,&
891 &     nfftdg,ngfftdg,nproc_fft,nsym,paral_fft,symrel,ngfftc,&
892 &     use_gpu_cuda=dtsets(idtset)%use_gpu_cuda)
893 
894      dtsets(idtset)%ngfftdg(:)=ngfftdg(:)
895      dtsets(idtset)%mgfftdg=mgfftdg
896      dtsets(idtset)%nfftdg=nfftdg
897 !    Compute fft distribution for fine grid
898      fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
899      call init_distribfft(mpi_enregs(idtset)%distribfft,'f', mpi_enregs(idtset)%nproc_fft,ngfftdg(2),ngfftdg(3))
900    end if
901 
902    dtsets(idtset)%mpw=mpw
903    ABI_DEALLOCATE(symrel)
904    ABI_DEALLOCATE(istwfk)
905    ABI_DEALLOCATE(kpt_with_shift)
906    ABI_DEALLOCATE(intarr)
907    ABI_DEALLOCATE(dprarr)
908 
909 !  Initialize data for the parallelization over atomic sites (PAW)
910    if (dtsets(idtset)%natom==1) dtsets(idtset)%paral_atom=0
911    if (dtsets(idtset)%usepaw==0) dtsets(idtset)%paral_atom=0
912    if (dtsets(idtset)%usewvl/=0) dtsets(idtset)%paral_atom=0
913    if (dtsets(idtset)%usedmft==1) dtsets(idtset)%paral_atom=0
914    if (optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_RESPFN.and.optdriver/=RUNL_GWLS) dtsets(idtset)%paral_atom=0
915    if (dtsets(idtset)%macro_uj/=0) dtsets(idtset)%paral_atom=0
916 
917    call initmpi_atom(dtsets(idtset),mpi_enregs(idtset))
918 
919 !  When a GPU (CUDA) is used, some defaults can change
920 !  according to a threshold on matrix sizes
921    if (dtsets(idtset)%use_gpu_cuda==1.or.dtsets(idtset)%use_gpu_cuda==-1) then
922      if (optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then
923        vectsize=dtsets(idtset)%mpw*dtsets(idtset)%nspinor/dtsets(idtset)%npspinor
924        if (all(dtsets(idtset)%istwfk(:)==2)) vectsize=2*vectsize
925        blocksize=dtsets(idtset)%npband*dtsets(idtset)%bandpp
926        if (dtsets(idtset)%paral_kgb==0) blocksize=dtsets(idtset)%npfft
927        if ((vectsize*blocksize**2)>=dtsets(idtset)%gpu_linalg_limit) then
928          if (.not.wfoptalg_read) then
929            dtsets(idtset)%wfoptalg=14
930            if (.not.fftalg_read) then
931              dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
932              if (usepaw==1) dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
933            end if
934            if (.not.ortalg_read) dtsets(idtset)%ortalg=-abs(dtsets(idtset)%ortalg)
935          end if
936        end if
937      end if
938    end if
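!   [Editorial note] Sketch of the GPU threshold test above, with assumed values: mpw=5000,
!   nspinor=1 and npspinor=1 give vectsize=5000 (doubled if all istwfk==2); with paral_kgb=1,
!   npband=4 and bandpp=2, blocksize=8, so vectsize*blocksize**2 = 5000*64 = 320000. If this
!   reaches gpu_linalg_limit and wfoptalg was not given in the input, wfoptalg is switched to 14
!   (and the fftalg/ortalg defaults are adjusted accordingly).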
939 
940 !  Initialize data for the WVL parallelization:
941    if(dtsets(idtset)%usewvl==1) then
942      mpi_enregs(idtset)%comm_wvl=mpi_enregs(idtset)%comm_cell
943      mpi_enregs(idtset)%nproc_wvl=xmpi_comm_size(mpi_enregs(idtset)%comm_wvl)
944      mpi_enregs(idtset)%me_wvl=xmpi_comm_rank(mpi_enregs(idtset)%comm_wvl)
945    end if
946 
947  end do
948 
949 !This is not a very clean exit in case of paral_kgb<0
950  if (iexit/=0)then
951    MSG_ERROR_NODUMP("aborting now")
952  end if
953 
954  DBG_EXIT("COLL")
955 
956 end subroutine mpi_setup