TABLE OF CONTENTS


ABINIT/compute_kgb_indicator [ Functions ]

[ Top ] [ Functions ]

NAME

 compute_kgb_indicator

FUNCTION

 Only for "KGB" parallelism (LOBPCG algorithm for Ground-state):
  Give an indicator of performance for a given distribution of processors
  (npband, npfft and bandpp).
  Determine best choice of parameters for Scalapack and/or Magma Linear Algebra routines.

INPUTS

  bandpp=internal lobpcg optimization variable
  glb_comm=communicator for global MPI communications
  mband=maximum number of bands.
  mband=maximum number of plane waves
  npband=number of processor 'band'
  npfft = number of processor 'fft'
  use_linalg_gpu=indicate if we also test the gpu linear algebra (compatible only with the legacy 2013 GPU code)

OUTPUT

  acc_kgb = indicator of performance
  npslk = number of process to used in communicators

SIDE EFFECTS

 This routine can be used to find an indicator in order to refine automatic process distribution.
   This indicator is returned in acc_kgb
 This routine can be used to find the optimal values of np_slk parameter (ScaLapack)
   and wheter or not we should use Magma for Linear Algebra in lobpcgwf

SOURCE

2076 subroutine compute_kgb_indicator(acc_kgb,bandpp,glb_comm,mband,mpw,npband,npfft,npslk,use_linalg_gpu)
2077 
2078  use m_abi_linalg
2079 
2080 !Arguments ------------------------------------
2081 !scalars
2082  integer,intent(in) :: bandpp,glb_comm,mband,mpw,npband,npfft
2083  integer,intent(inout) :: npslk,use_linalg_gpu
2084  real(dp),intent(inout) :: acc_kgb
2085 
2086 !Local variables-------------------------------
2087 !scalars
2088  integer,parameter :: max_number_of_npslk=10,max_number_of_iter=10
2089  integer :: blocksize,bigorder,ierr,ii,islk,islk1,iter,jj,keep_gpu
2090  integer :: kgb_comm,my_rank,np_slk,np_slk_max,np_slk_best,nranks
2091  integer :: use_lapack_gpu,use_slk,vectsize,wfoptalg
2092  real(dp) :: min_eigen,min_ortho,time_xeigen,time_xortho
2093  character(len=500) :: msg
2094 !arrays
2095  integer,allocatable :: ranks(:),val_npslk(:)
2096  real(dp),allocatable :: eigen(:),grama(:,:),gramb(:,:)
2097  complex(dpc),allocatable :: blockvectorbx(:,:),blockvectorx(:,:),sqgram(:,:)
2098 
2099 !******************************************************************
2100 
2101  DBG_ENTER("COLL")
2102 
2103 #ifdef DEBUG_MODE
2104  write(msg,'(a,3i3)') 'compute_kgb_indicator : (bpp,npb,npf) = ', bandpp, npband, npfft
2105  call wrtout(std_out,msg,'PERS')
2106 #endif
2107 
2108 !Create local communicator for test
2109  if (xmpi_paral==1) then
2110    nranks=npfft*npband
2111    ABI_MALLOC(ranks,(nranks))
2112    ranks=(/((my_rank-1),my_rank=1,nranks)/)
2113    kgb_comm=xmpi_subcomm(glb_comm,nranks,ranks,my_rank_in_group=my_rank)
2114    ABI_FREE(ranks)
2115  else
2116    kgb_comm=xmpi_comm_self
2117    my_rank=0
2118  end if
2119 
2120 !Only for process in the new subgroup
2121  if (my_rank/=xmpi_undefined) then
2122 
2123 !  We enforce vectsize >=blocksize  (This is not true in lobpcg but
2124 !  these are rare cases and this simplify the matrix constructions below...)
2125    blocksize=npband*bandpp
2126    vectsize=max(1+mpw/(npband*npfft),blocksize)
2127    bigorder=3*blocksize
2128 
2129    ABI_MALLOC(blockvectorx,(vectsize,blocksize))
2130    ABI_MALLOC(blockvectorbx,(vectsize,blocksize))
2131    ABI_MALLOC(sqgram,(blocksize,blocksize))
2132    ABI_MALLOC(grama,(2*bigorder,bigorder))
2133    ABI_MALLOC(gramb,(2*bigorder,bigorder))
2134    ABI_MALLOC(eigen,(bigorder))
2135    ABI_MALLOC(val_npslk,(max_number_of_npslk)) ! not too much values tested
2136 
2137    min_eigen=greatest_real
2138    min_ortho=greatest_real
2139    np_slk_best=-1 ; np_slk_max=0
2140 #ifdef HAVE_LINALG_SCALAPACK
2141    np_slk_max=min(max_number_of_npslk,npband*npfft)
2142 #endif
2143 
2144 !  Preselect a range of available np_slk values
2145    val_npslk(1:)=0 ; val_npslk(2)=1
2146    do islk=3,np_slk_max
2147      np_slk=val_npslk(islk-1)*2
2148      do while ((modulo(npband*npfft,np_slk)>0).and.(np_slk<(npband*npfft)))
2149        np_slk=np_slk+1
2150      end do
2151      if(np_slk>(npband*npfft).or.np_slk>mband) exit
2152      val_npslk(islk)=np_slk
2153    end do
2154    np_slk_max=islk-1
2155 
2156 !  Loop over np_slk values
2157    islk1=1
2158 #ifdef HAVE_LINALG_MAGMA
2159    if (use_linalg_gpu==ABI_GPU_LEGACY) islk1=0
2160 #endif
2161    do islk=islk1,np_slk_max
2162 
2163      time_xortho=zero ; time_xeigen=zero
2164 
2165      use_slk=0
2166      if (islk==0) then
2167 !      This is the test for the GPU
2168        use_lapack_gpu=1 ; np_slk=0
2169      else
2170        use_lapack_gpu=0 ; np_slk=val_npslk(islk)
2171        if (np_slk>0) use_slk=1
2172      end if
2173 
2174 !    Initialize linalg parameters for this np_slk value
2175 !    For the first np_slk value, everything is initialized
2176 !    For the following np_slk values, only Scalapack parameters are updated
2177      wfoptalg=14 ! Simulate use of LOBPCG
2178      call abi_linalg_init(bigorder,RUNL_GSTATE,wfoptalg,1,&
2179 &                         use_lapack_gpu,use_slk,np_slk,kgb_comm)
2180 
2181 !    We could do mband/blocksize iter as in lobpcg but it's too long
2182      do iter=1,max_number_of_iter
2183 
2184 !      Build matrixes
2185        do ii=1,vectsize
2186          do jj=1,blocksize
2187            if (ii>jj) then
2188              blockvectorx(ii,jj) =czero
2189              blockvectorbx(ii,jj)=czero
2190            else
2191              blockvectorx(ii,jj) =cone
2192              blockvectorbx(ii,jj)=cone
2193            end if
2194          end do
2195        end do
2196        grama=zero;gramb=zero
2197        do jj=1,bigorder
2198          do ii=jj,bigorder
2199            if (ii==jj) then
2200              grama(2*ii-1,jj)=one
2201              gramb(2*ii-1,jj)=one
2202            else
2203              grama(2*ii-1:2*ii,jj)=one
2204              grama(2*jj-1,ii)= one
2205              grama(2*jj  ,ii)=-one
2206            end if
2207          end do
2208        end do
2209 
2210 !      Call to abi_xorthonormalize
2211        time_xortho=time_xortho-abi_wtime()
2212        call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,kgb_comm,sqgram,vectsize)
2213        time_xortho = time_xortho + abi_wtime()
2214 
2215 !      Call to abi_xhegv
2216        time_xeigen=time_xeigen-abi_wtime()
2217        call abi_xhegv(1,'v','u',bigorder,grama,bigorder,gramb,bigorder,eigen,&
2218 &       x_cplx=2,use_slk=use_slk,use_gpu_magma=use_lapack_gpu)
2219        time_xeigen=time_xeigen+abi_wtime()
2220 
2221      end do ! iter
2222 
2223 !    Finalize linalg parameters for this np_slk value
2224 !    For the last np_slk value, everything is finalized
2225 !    For the previous np_slk values, only Scalapack parameters are updated
2226      call abi_linalg_finalize(use_lapack_gpu)
2227 
2228      time_xortho= time_xortho*mband/blocksize
2229      time_xeigen= time_xeigen*mband/blocksize
2230      if (time_xortho<min_ortho) min_ortho=time_xortho
2231      if (time_xeigen<min_eigen) then
2232        min_eigen=time_xeigen
2233        np_slk_best=np_slk
2234        keep_gpu=use_lapack_gpu
2235      end if
2236 
2237    end do ! np_slk
2238 
2239 #ifdef DEBUG_MODE
2240    write(msg,'(2(a,es15.3),a,i3)') ' In the best case, xortho took ',min_ortho,&
2241     ' and xeigen took ',min_eigen,' for np_slk=',np_slk_best
2242    call wrtout(std_out,msg,'PERS')
2243 #endif
2244 
2245 !  Final values to be sent to others process
2246    acc_kgb=min_ortho+four*min_eigen
2247    npslk=max(np_slk_best,1)
2248    use_linalg_gpu=keep_gpu
2249 
2250    ABI_FREE(blockvectorx)
2251    ABI_FREE(blockvectorbx)
2252    ABI_FREE(sqgram)
2253    ABI_FREE(grama)
2254    ABI_FREE(gramb)
2255    ABI_FREE(eigen)
2256    ABI_FREE(val_npslk)
2257 
2258  end if ! my_rank in group
2259 
2260 !Free local MPI communicator
2261  call xmpi_comm_free(kgb_comm)
2262 
2263 !Broadcast of results to be sure every process has them
2264  call xmpi_bcast(acc_kgb,0,glb_comm,ierr)
2265  call xmpi_bcast(npslk,0,glb_comm,ierr)
2266  call xmpi_bcast(use_linalg_gpu,0,glb_comm,ierr)
2267 
2268 #ifndef DEBUG_MODE
2269  ABI_UNUSED(msg)
2270 #endif
2271 
2272  DBG_EXIT("COLL")
2273 
2274 end subroutine compute_kgb_indicator

ABINIT/finddistrproc [ Functions ]

[ Top ] [ Functions ]

NAME

 finddistrproc

FUNCTION

   Given a total number of processors, find a suitable distribution
   that fill all the different levels of parallelization
   (npimage, nppert, np_spkpt, npspinor, npband, npfft, bandpp)
   Also determine parameters of parallel Linear Algebra routines
   (use_slk, np_slk, gpu_linalg_limit)

INPUTS

  dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables,
   for all datasets; at this stage only datasets with index lower than
   idtset are already initalized
  filnam(5)=character strings giving file names
  idtset=number of the current dataset
  mpi_enreg=information about MPI parallelization
  mband=maximum number of bands.
  ndtset_alloc=number of datasets, corrected for allocation of at least one data set
  tread(11)=flags indicating wether parallel input parameters were read from input file
            tread(1)  : paral_kgb      tread(6) : npfft
            tread(2)  : npimage        tread(7) : npband
            tread(3)  : nppert         tread(8) : bandpp
            tread(4)  : np_spkpt       tread(9) : use_slk
            tread(5)  : nspinor        tread(10): np_slk
            tread(11) : gpu_linalg_limit

SIDE EFFECTS

  iexit= if incremented, an exit is required
  dtset%paral_kgb= flag for band-fft parallelism
  dtset%npimage  = number of processors for parallelisation over image
  dtset%nppert   = number of processors for parallelisation over perturbations
  dtset%npspinor = number of processors for parallelisation on spinor components
  dtset%np_spkpt  = number of processors for parallelisation on spin / k points
  dtset%npfft    = number of processors for parallelisation on fft grid
  dtset%npband   = number of processors for parallelisation on bands
  dtset%nphf     = number of processors for parallelisation on occupied states for fock exchange
  dtset%bandpp   = internal parameter for lobpcg parallelisation algorithm
  dtset%use_slk  = flag for ScalaPAck use
  dtset%np_slk   = number of processors used in ScaLapack routines
  dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU

SOURCE

1116  subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread)
1117 
1118 !Arguments ------------------------------------
1119 !scalars
1120  integer,intent(in) :: idtset,mband,ndtset_alloc
1121  integer,intent(inout) :: iexit
1122  type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc)
1123  type(MPI_type),intent(inout) :: mpi_enreg
1124 !arrays
1125  integer,intent(in) :: tread(11)
1126  character(len=fnlen),intent(in) :: filnam(5)
1127 
1128 !Local variables-------------------------------
1129 !scalars
1130 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20)
1131  integer,parameter :: ALGO_NOT_SET=-1, ALGO_DEFAULT_PAR=2
1132  integer,parameter :: ALGO_CG=0, ALGO_LOBPCG_OLD=1, ALGO_LOBPCG_NEW=2, ALGO_CHEBFI=3, ALGO_CHEBFI_NEW=4
1133  integer,parameter :: NPFMAX=128,BLOCKSIZE_MAX=3000,MAXBAND_PRINT=10
1134  integer,parameter :: MAXCOUNT=250,MAXPRINT=10,MAXBENCH=25,MAXABIPY=25,NPF_CUTOFF=20
1135  real(dp),parameter :: relative_nband_range=0.025
1136  integer :: wf_algo,wf_algo_global,bpp,bpp_max,bpp_min,optdriver,autoparal,nblocks,blocksize
1137  integer :: npi_max,npi_min,npc,npc_max,npc_min
1138  integer :: np_sk,np_sk_max,np_sk_min,npp_max,npp_min
1139  integer :: nps,nps_max,nps_min,npf,npf_max,npf_min
1140  integer :: npb,npb_max,npb_min,max_ncpus,ount,paral_kgb
1141  integer :: work_size,nks_per_proc,tot_ncpus
1142  integer :: ib1,ib2,ibest,icount,ii,imin,jj,kk,mcount,mcount_eff,mpw
1143  integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff
1144  integer :: nproc,nproc1,nprocmin,np_slk,nthreads,use_linalg_gpu,omp_ncpus
1145  logical :: dtset_found,file_found,first_bpp,iam_master
1146  logical :: with_image,with_pert,with_kpt,with_spinor,with_fft,with_band,with_bandpp,with_thread
1147  real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,eff,ucvol,weight0
1148  character(len=10) :: suffix
1149  character(len=20) :: strg
1150  character(len=500) :: msg,msgttl
1151  character(len=fnlen) :: filden
1152  type(hdr_type) :: hdr0
1153 !arrays
1154  integer :: idum(1),idum3(3),ngmax(3),ngmin(3)
1155  integer,allocatable :: nband_best(:),isort(:),jdtset_(:)
1156  integer,allocatable :: my_algo(:),my_distp(:,:),nproc_best(:)
1157  integer,pointer :: nkpt_rbz(:)
1158  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3)
1159  real(dp),allocatable :: weight(:)
1160  real(dp),pointer :: nband_rbz(:,:)
1161  type(dataset_type),pointer :: dtset
1162 
1163 !******************************************************************
1164 
1165  DBG_ENTER("COLL")
1166 
1167 !Select current dataset
1168  dtset => dtsets(idtset)
1169 
1170 !Is automatic parallelization activated?
1171  autoparal = dtset%autoparal
1172  if (autoparal==0) return
1173 
1174 !Is it available
1175  if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then
1176    ABI_ERROR("autoparal>0 not available for Hartree-Fock or hybrid XC calculations!")
1177  end if
1178  if ((autoparal>1).and.dtset%wfoptalg/=4.and.dtset%wfoptalg/=14) then
1179    ABI_ERROR("autoparal>1 only available for the old LOBPCG algorithm (wfoptalg=4/14)!")
1180  end if
1181 
1182  ! Unit number used for outputting the autoparal sections
1183  ount = ab_out
1184 
1185  ! From the documentation:
1186  !
1187  !   If autoparal > 1 and max_ncpus is greater than 0, ABINIT analyzes the
1188  !   efficiency of the process distribution for each possible number of processors
1189  !   from 2 to max_ncpus. After having printed out the efficiency, the code stops.
1190 
1191  ! Handy local variables
1192  iam_master = (mpi_enreg%me==0)
1193  optdriver = dtset%optdriver
1194  max_ncpus = dtset%max_ncpus ; if (dtset%paral_kgb<0) max_ncpus=abs(dtset%paral_kgb)
1195  nthreads=xomp_get_max_threads()
1196  nproc=mpi_enreg%nproc
1197  if (max_ncpus>0) nproc = dtset%max_ncpus/nthreads
1198  if (xmpi_paral==0.and.max_ncpus<=0) nproc=1
1199 
1200  nprocmin=2
1201  if (xmpi_paral==1.and.max_ncpus<=0) nprocmin=max(2,nproc-100)
1202  if (max_ncpus>0.and.autoparal/=0) nprocmin=1
1203 
1204  wf_algo_global=ALGO_NOT_SET
1205  if (dtset%wfoptalg==0.and.tread(1)==1) wf_algo_global=ALGO_CG
1206  if (dtset%wfoptalg==4.or.dtset%wfoptalg==14) wf_algo_global=ALGO_LOBPCG_OLD
1207  if (dtset%wfoptalg==114) wf_algo_global=ALGO_LOBPCG_NEW
1208  if (dtset%wfoptalg==1) wf_algo_global=ALGO_CHEBFI
1209  if (dtset%wfoptalg==111) wf_algo_global=ALGO_CHEBFI_NEW
1210 
1211  ! Some peculiar cases (with direct exit)
1212  ! MG: What is the meaning of max_ncpus < 0. This is not documented!
1213  if (max_ncpus<=0) then
1214    if (nproc==1.and.max_ncpus<=0) then
1215      if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0
1216      if (tread(2)==0.or.xmpi_paral==0) dtset%npimage  = 1
1217      if (tread(3)==0.or.xmpi_paral==0) dtset%nppert   = 1
1218      if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1
1219      if (tread(5)==0.or.xmpi_paral==0) dtset%np_spkpt = 1
1220      if (tread(6)==0.or.xmpi_paral==0) dtset%npfft    = 1
1221      if (tread(7)==0.or.xmpi_paral==0) dtset%npband   = 1
1222      if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp   = 1
1223      if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk  = 0
1224      if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk  = 1000000
1225      return
1226    end if
1227    if ((optdriver/=RUNL_GSTATE.and. optdriver/=RUNL_RESPFN.and. optdriver/=RUNL_GWLS).or. &
1228        (optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then
1229      dtset%paral_kgb= 0
1230      dtset%npimage  = max(1,dtset%npimage)
1231      dtset%nppert   = max(1,dtset%nppert)
1232      dtset%npspinor = max(1,dtset%npspinor)
1233      dtset%np_spkpt = max(1,dtset%np_spkpt)
1234      dtset%npfft    = max(1,dtset%npfft)
1235      dtset%npband   = max(1,dtset%npband)
1236      dtset%bandpp   = max(1,dtset%bandpp)
1237      return
1238    end if
1239  end if
1240 
1241  ! Need the metric tensor
1242  call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd)
1243  call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
1244 
1245  ! Determine some quantities related to plane waves
1246  !  - Crude estimation of the number of PW
1247  !  - Number of G vectors in each direction
1248  mpw=0;ngmin=0;ngmax=0
1249  if (optdriver==RUNL_GSTATE) then
1250    ecut_eff = dtset%ecut*dtset%dilatmx**2
1251    mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation
1252    if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1
1253    call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt)
1254    write(msg,'(a,i0)') ' getmpw sequential formula gave: ',mpw
1255    call wrtout(std_out,msg)
1256  end if
1257 
1258  ! Parallelization over images
1259  npi_min=1;npi_max=1;nimage_eff=1
1260  if (optdriver==RUNL_GSTATE) then
1261    nimage_eff=dtset%ndynimage
1262    if (dtset%ntimimage<=1) nimage_eff=dtset%nimage
1263    npi_min=max(1,dtset%npimage)
1264    npi_max=min(nproc,nimage_eff)
1265    if (tread(2)==1) npi_max=dtset%npimage
1266  end if
1267 
1268 !Parallelization over k-points and spin components (GS)
1269  np_sk_min=1;np_sk_max=1;nkpt_eff=0
1270  if (optdriver==RUNL_GSTATE) then
1271    nkpt_eff=dtset%nkpt*dtset%nsppol
1272    np_sk_min=max(1,dtset%np_spkpt)
1273    np_sk_max=min(nproc,nkpt_eff)
1274    if (tread(4)==1) np_sk_max=dtset%np_spkpt
1275  end if
1276 
1277 !Parallelization over perturbations, k-points and spin components (DFPT)
1278  npp_min=1;npp_max=1;npert_eff=1
1279  if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1280    if (dtset%paral_rf==1) then
1281      call dtset%get_npert_rbz(nband_rbz, nkpt_rbz, npert_eff)
1282      do jj=1,npert_eff
1283        ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj))
1284        nkpt_eff=max(nkpt_eff,ii)
1285      end do
1286      npp_min=max(1,dtset%nppert)
1287      npp_max=min(nproc,npert_eff)
1288      if (tread(3)==1) then
1289        npp_max=dtset%nppert
1290        if (npp_max>npert_eff) then
1291          npp_min=npert_eff;npp_max=npert_eff
1292          ABI_WARNING('nppert is bigger than npert; we set nppert=npert')
1293        end if
1294      end if
1295      np_sk_min=1
1296      np_sk_max=min(nproc,nkpt_eff)
1297      ABI_FREE(nkpt_rbz)
1298      ABI_FREE(nband_rbz)
1299    else
1300      nkpt_eff=nproc
1301      np_sk_min=nproc-5
1302      np_sk_max=nproc
1303    end if
1304  end if
1305 
1306 !Parallelization over spinorial components
1307  nps_min=1;nps_max=1
1308  if (optdriver==RUNL_GSTATE) then
1309    nps_min=max(1,dtset%npspinor)
1310    nps_max=min(nproc,dtset%nspinor)
1311    if (tread(5)==1) nps_max=dtset%npspinor
1312  end if
1313 
1314 !KGB Parallelization
1315 
1316  npf_min=1;npf_max=1
1317  npb_min=1;npb_max=1
1318  bpp_min=1;bpp_max=1
1319  n2=0;n3=0
1320  if (optdriver==RUNL_GSTATE) then
1321 
1322 !  >> FFT level
1323    npf_min=max(1,dtset%npfft)
1324    npf_min=min(npf_min,ngmin(2))
1325    npf_max=min(nproc,NPFMAX)
1326    if (tread(6)==1) then
1327      npf_max=dtset%npfft
1328      if (npf_max>ngmin(2)) then
1329        write(msg,'(3a)') &
1330         "Value of npfft given in input file is too high for the FFT grid!",ch10,&
1331         "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)."
1332        ABI_ERROR(msg)
1333      end if
1334    end if
1335    npf_max=min(npf_max,ngmin(2))
1336    ! Deactivate MPI FFT parallelism for GPU
1337    if (dtset%gpu_option/=ABI_GPU_DISABLED) then
1338      npf_min=1;npf_max=1
1339    end if
1340    !Deactivate MPI FFT parallelism for GPU
1341    if (tread(1)==1.and.dtset%paral_kgb==0) then
1342      npf_min=1;npf_max=1
1343    end if
1344    !Deactivate MPI FFT parallelism for multi-threaded LOBPCG / CHEBFI
1345    if ((wf_algo_global==ALGO_LOBPCG_NEW.or.wf_algo_global==ALGO_CHEBFI.or.wf_algo_global==ALGO_CHEBFI_NEW).and.nthreads>1) then
1346      npf_min=1;npf_max=1
1347    end if
1348 
1349    ! Number of FFT procs has to be a multiple of FFT grid sizes
1350    ! In case of a restart from a density file, it has to be
1351    ! compatible with the FFT grid used for the density
1352    n2=dtset%ngfft(2) ; n3=dtset%ngfft(3)
1353    if (n2==0.and.n3==0) then
1354      if (dtset%getden/=0.or.dtset%irdden/=0.or.&
1355 &        dtset%getkden/=0.or.dtset%irdkden/=0.or.dtset%iscf<0) then
1356        dtset_found=.false.;file_found=.false.
1357        !1-Try to find ngfft from previous dataset
1358        if (dtset%getden/=0.or.dtset%getkden/=0) then
1359          do ii=1,ndtset_alloc
1360            jj=dtset%getden;if (jj==0) jj=dtset%getkden
1361            if (jj<0) jj=dtset%jdtset+jj
1362            if (dtsets(ii)%jdtset==jj) then
1363              dtset_found=.true.
1364              n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3)
1365            end if
1366          end do
1367        end if
1368        !2-If not found, try to extract ngfft from density file
1369        if (.not.dtset_found) then
1370          !Retrieve file name
1371          if (dtset%getden/=0.or.dtset%irdden/=0) then
1372            suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN'
1373          else if (dtset%getkden/=0.or.dtset%irdkden/=0) then
1374            suffix='_KDEN';if (dtset%nimage>1) suffix='_IMG1_KDEN'
1375          end if
1376          ABI_MALLOC(jdtset_,(0:ndtset_alloc))
1377          jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset
1378          call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii)
1379          ABI_FREE(jdtset_)
1380          !Retrieve ngfft from file header
1381          idum3=0
1382          if (mpi_enreg%me==0) then
1383            inquire(file=trim(filden),exist=file_found)
1384            if (file_found) then
1385              call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self)
1386              idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1
1387              call hdr0%free()
1388              ABI_WARNING("Cannot find filden "//filden)
1389            end if
1390          end if
1391          call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii)
1392          n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0)
1393        end if
1394      end if
1395    end if
1396 
1397 !  >> Band level
1398    npb_min=max(1,dtset%npband)
1399    npb_max=min(nproc,mband)
1400    if (tread(7)==1) npb_max=dtset%npband
1401    if (tread(1)==1.and.dtset%paral_kgb==0) then
1402      npb_min=1;npb_max=1
1403    end if
1404 
1405 !  >> banddp level
1406    if (tread(8)==1) then
1407      bpp_min = dtset%bandpp
1408    else
1409      bpp_min = 1
1410    end if
1411    bpp_max=mband
1412    if (wf_algo_global==ALGO_LOBPCG_OLD) bpp_max=max(4,nint(mband/10.)) ! reasonable bandpp max
1413    if (tread(8)==1) bpp_max=dtset%bandpp
1414    if (wf_algo_global==ALGO_CHEBFI) bpp_min=1 ! bandpp not used with ChebFi
1415    if (wf_algo_global==ALGO_CHEBFI) bpp_max=1
1416    if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_min=1 ! bandpp not used with ChebFi
1417    if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_max=1 ! bandpp not used with ChebFi
1418 
1419  end if ! RUNL_GSTATE
1420 
1421 !Disable KGB parallelisation in some cases:
1422 !  - no GS
1423 !  - paral_kgb=0 present in input file
1424 !  - nstep=0
1425 !  - Hartree-Fock or hybrid calculation (for now on)
1426  if ( (optdriver/=RUNL_GSTATE).or.(dtset%paral_kgb==0.and.tread(1)==1).or. &
1427       (dtset%nstep==0).or.(dtset%usefock==1)) then
1428    nps_min=1; nps_max=1
1429    npf_min=1; npf_max=1
1430    npb_min=1; npb_max=1
1431    bpp_min=1; bpp_max=1
1432  end if
1433 
1434  ! Which levels of parallelism do we have?
1435  with_image =(npi_min/=1.or.npi_max/=1)
1436  with_pert  =(npp_min/=1.or.npp_max/=1)
1437  with_kpt   =(np_sk_min/=1.or.np_sk_max/=1)
1438  with_spinor=(nps_min/=1.or.nps_max/=1)
1439  with_fft   =(npf_min/=1.or.npf_max/=1)
1440  with_band  =(npb_min/=1.or.npb_max/=1)
1441  with_bandpp=(bpp_min/=1.or.bpp_max/=1)
1442  with_thread=(nthreads>1)
1443 
1444 !Allocate lists
1445  ABI_MALLOC(my_distp,(10,MAXCOUNT))
1446  ABI_MALLOC(weight,(MAXCOUNT))
1447  ABI_MALLOC(my_algo,(MAXCOUNT))
1448  my_distp(1:7,:)=0;weight(:)=zero
1449  my_distp(8,:)=dtset%use_slk
1450  my_distp(9,:)=dtset%np_slk
1451  my_distp(10,:)=dtset%gpu_linalg_limit
1452  my_algo(:)=wf_algo_global
1453  icount=0;imin=1
1454 
1455 !Cells= images or perturbations
1456  npc_min=1;npc_max=1;ncell_eff=1
1457  if (optdriver==RUNL_GSTATE) then
1458    ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max
1459  end if
1460  if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1461    ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max
1462  end if
1463 
1464 !Loop over all possibilities
1465 !Computation of weight~"estimated acceleration"
1466 !================================================================
1467 
1468 !Cells= images or perturbations
1469  npc_min=1;npc_max=1;ncell_eff=1
1470  if (optdriver==RUNL_GSTATE) then
1471    ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max
1472  end if
1473  if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1474    ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max
1475  end if
1476 
1477 !>>>>> CELLS
1478  do npc=npc_min,npc_max
1479    acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc)
1480 
1481 !  >>>>> K-POINTS
1482    do np_sk=np_sk_min,np_sk_max
1483 !    -> for DFPT runs, impose that nsppol divides np_sk
1484      if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE]) .and. modulo(np_sk,dtset%nsppol)>0.and.np_sk>1) cycle
1485      acc_k=one;if (np_sk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,np_sk)
1486 
1487 !    >>>>> SPINORS
1488      do nps=nps_min,nps_max
1489        acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps)
1490 
1491 !      >>>>> FFT
1492        do npf=npf_min,npf_max
1493 !        -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok)
1494          if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle
1495 !        -> npf should be only divisible by 2, 3 or 5
1496          ii=npf
1497          do while (modulo(ii,2)==0)
1498            ii=ii/2
1499          end do
1500          do while (modulo(ii,3)==0)
1501            ii=ii/3
1502          end do
1503          do while (modulo(ii,5)==0)
1504            ii=ii/5
1505          end do
1506          if(ii/=1) cycle
1507 
1508 !        Change algo if npfft>1
1509          wf_algo=wf_algo_global
1510          if (optdriver==RUNL_GSTATE.and.npf>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR
1511 
1512 !        FFT parallelism not compatible with multithreading
1513          if (wf_algo==ALGO_LOBPCG_NEW.or.wf_algo==ALGO_CHEBFI.or.wf_algo==ALGO_CHEBFI_NEW) then
1514            if (nthreads>1.and.npf>1) cycle
1515          end if
1516 
1517 !        >>>>> BANDS
1518          do npb=npb_min,npb_max
1519            nproc1=npc*np_sk*nps*npf*npb
1520            if (nproc1<nprocmin)     cycle
1521            if (nproc1>nproc)        cycle
1522            if (modulo(mband,npb)>0) cycle
1523 
1524 !          Change algo if npband>1
1525            if (optdriver==RUNL_GSTATE.and.npb>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR
1526 
1527 !          Base speedup
1528            acc_kgb_0=one;if (npb*npf*nthreads>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf*nthreads))
1529 
1530            if (npb*npf>4.and.wf_algo==ALGO_LOBPCG_OLD) then
1531 !            Promote npb=npf
1532              acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf))
1533 !            Promote npf<=20
1534              if (npf>20)then
1535                acc_kgb_0=acc_kgb_0* &
1536 &                 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) &
1537 &                 /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2
1538              end if
1539            end if
1540 
1541            first_bpp=.true.
1542            do bpp=bpp_min,bpp_max
1543 
1544              if (wf_algo==ALGO_LOBPCG_NEW) then
1545                blocksize=npb*bpp;nblocks=mband/blocksize
1546                if (modulo(bpp,nthreads)>0) cycle
1547                if ((bpp>1).and.(modulo(bpp,2)>0)) cycle
1548                if (modulo(mband,npb*bpp)>0) cycle
1549              else if (wf_algo==ALGO_LOBPCG_OLD) then
1550                blocksize=npb*bpp;nblocks=mband/blocksize
1551                if (modulo(mband/npb,bpp)>0) cycle
1552                if ((bpp>1).and.(modulo(bpp,2)>0)) cycle
1553                if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle
1554                if (npb*npf<=4.and.(.not.first_bpp)) cycle
1555              else if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then
1556                !Nothing
1557              else
1558                if (bpp/=1.or.npb/=1) cycle
1559              end if
1560 
1561              first_bpp=.false.
1562 
1563              acc_kgb=acc_kgb_0
1564 !            OLD LOBPCG: promote bpp*npb>mband/3
1565              if (wf_algo==ALGO_LOBPCG_OLD) then
1566                if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband))
1567              end if
1568 !            NEW LOBPCG: promote minimal number of blocks
1569 !                        promote block size <= BLOCKSIZE_MAX
1570              if (wf_algo==ALGO_LOBPCG_NEW) then
1571                acc_kgb=acc_kgb*(one-0.9_dp*dble(nblocks-1)/dble(mband-1))
1572                if (blocksize>BLOCKSIZE_MAX) acc_kgb=acc_kgb*max(0.1_dp,one-dble(blocksize)/dble(10*BLOCKSIZE_MAX))
1573                if (nthreads==1) then
1574 !                Promote npband vs bandpp & npfft
1575                  if (blocksize>1) acc_kgb=acc_kgb*(0.1_dp*bpp+0.9_dp-blocksize)/(one-blocksize)
1576                  if (npb*npf>4.and.mband>100) acc_kgb=acc_kgb*(one-0.8_dp*((three*bpp*npb)/(one*mband)-one)**2)
1577                  tot_ncpus=max(npb,npf);if (tot_ncpus==2) tot_ncpus=0
1578                  acc_kgb=acc_kgb*(one-0.8_dp*((dble(npb)/dble(npf))-2_dp)**2/(tot_ncpus-2_dp)**2)
1579                  eff=max(npf,20);acc_kgb=acc_kgb*(one-0.8_dp*min(one,(eff-20)**2))
1580                end if
1581              end if
1582 
1583 !            CHEBFI: promote npfft=npband and nband>=npfft
1584              if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then
1585                if (npf>1) then
1586                  if (npb>npf) then
1587                    acc_kgb=acc_kgb*(one-0.8_dp*0.25_dp*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2)
1588                  else
1589                    acc_kgb=acc_kgb*(one-0.8_dp*nproc1**2*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2)
1590                  end if
1591                end if
1592              end if
1593 
1594 !            Resulting "weight"
1595 !            weight0=acc_c*acc_k*acc_s*acc_kgb
1596              weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+np_sk+nps+(npf*npb))
1597 
1598 !            Store data
1599              icount=icount+1
1600              if (icount<=MAXCOUNT) then
1601                my_algo(icount)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET)
1602                my_distp(1:7,icount)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/)
1603                weight(icount)=weight0
1604                if (weight0<weight(imin)) imin=icount
1605              else
1606                if (weight0>weight(imin)) then
1607                  my_algo(imin)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET)
1608                  my_distp(1:7,imin)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/)
1609                  weight(imin)=weight0
1610                  idum=minloc(weight);imin=idum(1)
1611                end if
1612              end if
1613 
1614            end do ! bpp
1615          end do ! npb
1616        end do ! npf
1617      end do ! nps
1618    end do ! np_sk
1619  end do ! npc
1620 
1621 !Compute number of selected distributions
1622  mcount_eff=icount
1623  mcount=min(mcount_eff,MAXCOUNT)
1624 
1625 !Stop if no solution found
1626  if (mcount==0) then
1627 !  Override here the 0 default value changed in indefo1
1628    dtset%npimage  = max(1,dtset%npimage)
1629    dtset%nppert   = max(1,dtset%nppert)
1630    dtset%np_spkpt = max(1,dtset%np_spkpt)
1631    dtset%npspinor = max(1,dtset%npspinor)
1632    dtset%npfft    = max(1,dtset%npfft)
1633    dtset%npband   = max(1,dtset%npband)
1634    dtset%bandpp   = max(1,dtset%bandpp)
1635    write(msg,'(a,i0,2a,i0,a)')  &
1636 &  'Your input dataset does not let Abinit find an appropriate process distribution with nCPUs=',nproc*nthreads,ch10, &
1637 &  'Try to comment all the np* vars and set max_ncpus=',nthreads*nproc,' to have advices on process distribution.'
1638    ABI_WARNING(msg)
1639    if (max_ncpus>0) call wrtout(ab_out,msg, do_flush=.True.)
1640    iexit=iexit+1
1641  end if
1642 
1643 !Sort data by increasing weight
1644  if (mcount>0) then
1645    ABI_MALLOC(isort,(mcount))
1646    isort=(/(ii,ii=1,mcount)/)
1647    call sort_dp(mcount,weight,isort,tol6)
1648    ncount=min(mcount,MAXPRINT)
1649  end if
1650 
1651 !Deduce a global value for paral_kgb
1652  paral_kgb=dtset%paral_kgb
1653  if (tread(1)==0) then
1654    if (any(my_algo(:)/=ALGO_CG)) paral_kgb=1
1655  end if
1656 
1657  ! ======================================
1658  ! Print output for abipy in Yaml format
1659  ! ======================================
1660 
1661  ! Please DO NOT CHANGE this part without contacting gmatteo first
1662  ! since ANY CHANGE can easily break the interface with AbiPy.
1663  if (iam_master .and. max_ncpus > 0.and. (mcount>0 .or. wf_algo_global == ALGO_CG)) then
1664    write(ount,'(2a)')ch10,"--- !Autoparal"
1665    if (optdriver==RUNL_GSTATE .and. paral_kgb == 0) then
1666      write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)"
1667    else if (optdriver==RUNL_GSTATE) then
1668      write(ount,'(a)')'# Autoparal section for GS calculations with paral_kgb 1'
1669    else if (optdriver==RUNL_RESPFN) then
1670      write(ount,'(a)')'# Autoparal section for DFPT calculations'
1671    else if (optdriver==RUNL_LONGWAVE) then
1672      write(ount,'(a)')'# Autoparal section for LONGWAVE calculations'
1673    else
1674      ABI_ERROR(sjoin('Unsupported optdriver:', itoa(optdriver)))
1675    end if
1676    write(ount,"(a)")   "info:"
1677    write(ount,"(a,i0)")"    autoparal: ",autoparal
1678    write(ount,"(a,i0)")"    paral_kgb: ",paral_kgb
1679    write(ount,"(a,i0)")"    max_ncpus: ",max_ncpus
1680    write(ount,"(a,i0)")"    nspinor: ",dtset%nspinor
1681    write(ount,"(a,i0)")"    nsppol: ",dtset%nsppol
1682    write(ount,"(a,i0)")"    nkpt: ",dtset%nkpt
1683    write(ount,"(a,i0)")"    mband: ",mband
1684    write(ount,"(a)")"configurations:"
1685 
1686    if (optdriver==RUNL_GSTATE.and.paral_kgb==0) then
1687      work_size = dtset%nkpt * dtset%nsppol
1688      do ii=1,max_ncpus
1689        if (ii > work_size) cycle
1690        do omp_ncpus=1,nthreads
1691          nks_per_proc = work_size / ii
1692          nks_per_proc = nks_per_proc + MOD(work_size, ii)
1693          eff = (one * work_size) / (ii * nks_per_proc)
1694          write(ount,"(a,i0)")"    - tot_ncpus: ",ii * omp_ncpus
1695          write(ount,"(a,i0)")"      mpi_ncpus: ",ii
1696          write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus
1697          write(ount,"(a,f12.9)")"      efficiency: ",eff
1698          !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
1699        end do
1700      end do
1701 
1702    else if (optdriver==RUNL_GSTATE) then
1703      omp_ncpus=nthreads
1704      do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1
1705        ii=isort(jj)
1706        tot_ncpus = my_distp(7,ii)
1707        eff = weight(jj) / tot_ncpus
1708        write(ount,'(a,i0)')'    - tot_ncpus: ',tot_ncpus
1709        write(ount,'(a,i0)')'      mpi_ncpus: ',tot_ncpus
1710        write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus
1711        write(ount,'(a,f12.9)')'      efficiency: ',eff
1712        !write(ount,'(a,f12.2)')'      mem_per_cpu: ',mempercpu_mb
1713        write(ount,'(a)'   )'      vars: {'
1714        write(ount,'(a,i0,a)')'            npimage: ',my_distp(1,ii),','
1715        ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy
1716        write(ount,'(a,i0,a)')'            npkpt: ',my_distp(2,ii),','
1717        !write(ount,'(a,i0,a)')'            np_spkpt: ',my_distp(2,ii),','
1718        write(ount,'(a,i0,a)')'            npspinor: ',my_distp(3,ii),','
1719        write(ount,'(a,i0,a)')'            npfft: ', my_distp(4,ii),','
1720        write(ount,'(a,i0,a)')'            npband: ',my_distp(5,ii),','
1721        write(ount,'(a,i0,a)')'            bandpp: ',my_distp(6,ii),','
1722        write(ount,'(a)')   '            }'
1723      end do
1724 
1725    else if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1726      do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1
1727        ii=isort(jj)
1728        tot_ncpus = my_distp(7,ii)
1729        eff = weight(jj) / tot_ncpus
1730        write(ount,'(a,i0)')'    - tot_ncpus: ',tot_ncpus
1731        write(ount,'(a,i0)')'      mpi_ncpus: ',tot_ncpus
1732        !write(ount,'(a,i0)')'      omp_ncpus: ',omp_ncpus !OMP not supported  (yet)
1733        write(ount,'(a,f12.9)')'      efficiency: ',eff
1734        !write(ount,'(a,f12.2)')'      mem_per_cpu: ',mempercpu_mb
1735        write(ount,'(a)'   )'      vars: {'
1736        write(ount,'(a,i0,a)')'             nppert: ', my_distp(1,ii),','
1737        ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy
1738        write(ount,'(a,i0,a)')'             npkpt: ', my_distp(2,ii),','
1739        !write(ount,'(a,i0,a)')'             np_spkpt: ', my_distp(2,ii),','
1740        write(ount,'(a)')   '            }'
1741       end do
1742    end if
1743    write(ount,'(a)')"..."
1744  end if
1745 
1746 !Print out tab with selected choices
1747  if (mcount>0.and.iam_master) then
1748    if (nthreads==1) then
1749      write(msg,'(a,1x,100("="),2a,i0,2a)') ch10,ch10,&
1750 &     ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,':',ch10
1751    else
1752      write(msg,'(a,1x,100("="),2a,i0,a,i0,2a)')  ch10,ch10,&
1753 &     ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,&
1754 &     ' and ',nthreads,' openMP threads:',ch10
1755    end if
1756    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1757    !Titles of columns
1758    msgttl='~'
1759    if (with_image)  msgttl=trim(msgttl)//'~~~~~~~~~~~'
1760    if (with_pert)   msgttl=trim(msgttl)//'~~~~~~~~~~~'
1761    msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! kpt
1762    if (with_spinor) msgttl=trim(msgttl)//'~~~~~~~~~~'
1763    if (with_fft)    msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1764    if (with_band)   msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1765    if (with_bandpp) msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1766    if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~'
1767    msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! nproc
1768    if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1769    msgttl=trim(msgttl)//'~~~~~~~~~~~'   ! CPUs
1770    msgttl=' '//trim(msgttl)
1771    call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl)
1772    msg='|'
1773    if (with_image)  msg=trim(msg)//'   npimage|'
1774    if (with_pert)   msg=trim(msg)//'    nppert|'
1775    msg=trim(msg)//'       np_spkpt|'
1776    if (with_spinor) msg=trim(msg)//' npspinor|'
1777    if (with_fft)    msg=trim(msg)//'       npfft|'
1778    if (with_band)   msg=trim(msg)//'      npband|'
1779    if (with_bandpp) msg=trim(msg)//'      bandpp|'
1780    if (with_thread) msg=trim(msg)//' #Threads|'
1781    msg=trim(msg)//'  #MPI(proc)|'
1782    if (with_thread) msg=trim(msg)//'       #CPUs|'
1783    msg=trim(msg)//'    WEIGHT|'
1784    msg=' '//trim(msg)
1785    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1786    msg='|'
1787    write(strg,'(i4,a,i4,a)') npi_min,'<<',npi_max,'|';if (with_image)  msg=trim(msg)//trim(strg)
1788    write(strg,'(i4,a,i4,a)') npp_min,'<<',npp_max,'|';if (with_pert)   msg=trim(msg)//trim(strg)
1789    write(strg,'(i5,a,i5,a)') np_sk_min,'<<',np_sk_max,'|';                 msg=trim(msg)//trim(strg)
1790    write(strg,'(i5,a,i2,a)') nps_min,'<<',nps_max,'|';if (with_spinor) msg=trim(msg)//trim(strg)
1791    write(strg,'(i5,a,i5,a)') npf_min,'<<',npf_max,'|';if (with_fft)    msg=trim(msg)//trim(strg)
1792    write(strg,'(i5,a,i5,a)') npb_min,'<<',npb_max,'|';if (with_band)   msg=trim(msg)//trim(strg)
1793    write(strg,'(i5,a,i5,a)') bpp_min,'<<',bpp_max,'|';if (with_bandpp) msg=trim(msg)//trim(strg)
1794    write(strg,'(i9,a)'     ) nthreads            ,'|';if (with_thread) msg=trim(msg)//trim(strg)
1795    write(strg,'(i5,a,i5,a)') 1      ,'<<',nproc  ,'|';                 msg=trim(msg)//trim(strg)
1796    write(strg,'(i4,a,i6,a)') nthreads,'<<',nthreads*nproc,'|';if (with_thread) msg=trim(msg)//trim(strg)
1797    write(strg,'(a,i6,a)')   '  <=',nthreads*nproc,'|';                 msg=trim(msg)//trim(strg)
1798    msg=' '//trim(msg)
1799    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1800    call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl)
1801    !Loop over selected choices
1802    do jj=mcount,mcount-ncount+1,-1
1803      ii=isort(jj)
1804      msg='|'
1805      write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_image)  msg=trim(msg)//trim(strg)
1806      write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_pert)   msg=trim(msg)//trim(strg)
1807      write(strg,'(i12,a)') my_distp(2,ii),'|';                 msg=trim(msg)//trim(strg)
1808      write(strg,'(i9,a)')  my_distp(3,ii),'|';if (with_spinor) msg=trim(msg)//trim(strg)
1809      write(strg,'(i12,a)') my_distp(4,ii),'|';if (with_fft)    msg=trim(msg)//trim(strg)
1810      write(strg,'(i12,a)') my_distp(5,ii),'|';if (with_band)   msg=trim(msg)//trim(strg)
1811      write(strg,'(i12,a)') my_distp(6,ii),'|';if (with_bandpp) msg=trim(msg)//trim(strg)
1812      write(strg,'(i9,a)')  nthreads      ,'|';if (with_thread) msg=trim(msg)//trim(strg)
1813      write(strg,'(i12,a)') my_distp(7,ii),'|';                 msg=trim(msg)//trim(strg)
1814      write(strg,'(i12,a)') nthreads*my_distp(7,ii),'|';if (with_thread) msg=trim(msg)//trim(strg)
1815      write(strg,'(f10.3,a)') weight(jj)  ,'|';                 msg=trim(msg)//trim(strg)
1816      msg=' '//trim(msg)
1817      call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1818    end do
1819    !End of tab
1820    call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl)
1821    write(msg,'(a,i6,a,i6,a)')' Only the best possible choices for nproc are printed...'
1822    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1823  end if ! mcount>0
1824 
1825 !Determine an optimal number of bands
1826  if (optdriver==RUNL_GSTATE.and. &
1827 &    (any(my_algo(1:mcount)==ALGO_LOBPCG_OLD.or. &
1828 &         my_algo(1:mcount)==ALGO_LOBPCG_NEW.or. &
1829 &         my_algo(1:mcount)==ALGO_CHEBFI.or. &
1830 &         my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then
1831    if (mcount>0) then
1832      icount=isort(mcount)
1833      npc=my_distp(1,icount);np_sk=my_distp(2,icount)
1834      nps=my_distp(3,icount);npf=my_distp(4,icount)
1835    else
1836      npc=1;if (with_image ) npc=npi_min
1837      np_sk=1;if (with_kpt   ) np_sk=np_sk_min
1838      nps=1;if (with_spinor) nps=nps_min
1839      npf=1;if (with_fft   ) npf=npf_min
1840    end if
1841    nproc1=npc*np_sk*nps*npf
1842    msg=ch10//' >>> Possible (best) choices for the number of bands (nband) are:'
1843    if (with_image.or.with_kpt.or.with_spinor.or.with_fft) msg=trim(msg)//ch10//'     with:'
1844    write(strg,'(a,i0)') ' npimage=' ,npc;if (with_image)  msg=trim(msg)//trim(strg)
1845    write(strg,'(a,i0)') ' np_spkpt=' ,np_sk;if (with_kpt)    msg=trim(msg)//trim(strg)
1846    write(strg,'(a,i0)') ' npspinor=',nps;if (with_spinor) msg=trim(msg)//trim(strg)
1847    write(strg,'(a,i0)') ' npfft='   ,npf;if (with_fft)    msg=trim(msg)//trim(strg)
1848    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1849    ib1=mband-int(mband*relative_nband_range);if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW) ib1=mband
1850    ib2=mband+int(mband*relative_nband_range)
1851    ABI_MALLOC(nproc_best,(1+ib2-ib1))
1852    ABI_MALLOC(nband_best,(1+ib2-ib1))
1853    nproc_best(:)=1
1854    nband_best=(/(ii,ii=ib1,ib2)/)
1855    bpp=merge(1,nthreads,my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW)
1856    do ii=ib1,ib2
1857      do jj=1,nproc/nproc1
1858        ibest=1
1859        do kk=1,jj
1860          if (mod(jj,kk)/=0) cycle
1861          if (mod(ii,kk*bpp)==0) ibest=max(ibest,kk)
1862        end do
1863        nproc_best(1+ii-ib1)=max(nproc_best(1+ii-ib1),ibest)
1864      end do
1865    end do
1866    call sort_int(1+ib2-ib1,nproc_best,nband_best)
1867    kk=-1
1868    do ii=1+ib2-ib1,max(ib2-ib1-MAXBAND_PRINT,1),-1
1869      write(msg,'(3(a,i6),a,i3,a,i5,a)') '     nband=',nband_best(ii),' using ',nproc1*nproc_best(ii)*nthreads,&
1870 &        ' CPUs =',nproc1*nproc_best(ii),' MPI x',nthreads,' threads (npband=',nproc_best(ii),')'
1871      call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1872      if (nband_best(ii)==mband) kk=nproc_best(ii)
1873    end do
1874    if (kk==maxval(nproc_best(:))) then
1875      if (my_algo(icount)/=ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then
1876        write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be the best choice!'
1877      end if
1878      if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then
1879        write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be a good choice!'
1880      end if
1881      call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1882    end if
1883    ABI_FREE(nproc_best)
1884    ABI_FREE(nband_best)
1885  end if
1886 
1887  if (optdriver==RUNL_GSTATE.and.(any(my_algo(1:mcount)==ALGO_CHEBFI .or. my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then
1888    write(msg,'(5a)') &
1889 &   ' >>> Note that with the "Chebyshev Filtering" algorithm, it is often',ch10,&
1890 &   '     better to increase the number of bands (10% more or a few tens more).',ch10,&
1891 &   '     Advice: increase nband and put nbdbuf input variable to (nband_new-nband_old).'
1892    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1893  end if
1894 
1895 !Refinement of the process distribution by mean of a LinAlg routines benchmarking
1896  if (mcount>0.and.optdriver==RUNL_GSTATE.and.autoparal/=1) then
1897    icount=isort(mcount)
1898    if (autoparal/=3) then
1899      if (autoparal==2) then
1900        write(msg,'(5a,9(a10,a1))') ch10, &
1901 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
1902 &       ' Weights below are corrected according:',ch10,&
1903 &       'npimage','|','np_spkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
1904 &       'nproc'  ,'|','weight','|','new weight','|'
1905      else
1906        write(msg,'(5a,11(a10,a1))') ch10, &
1907 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
1908 &       ' Weights below are corrected according:',ch10,&
1909 &       'npimage','|','np_spkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
1910 &       'nproc'  ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|'
1911      end if
1912      call wrtout(std_out,msg);if (max_ncpus > 0) call wrtout(ab_out,msg)
1913    end if
1914    acc_k=zero
1915    ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1
1916    do jj=mcount,mcount-ncount+1,-1
1917      ii=isort(jj)
1918      npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii)
1919      if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then
1920        use_linalg_gpu=dtset%gpu_option
1921        call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu)
1922        if (autoparal/=2) then
1923          my_distp(9,ii)=np_slk
1924          if (np_slk>0) my_distp(8,ii)=1
1925 !        * gpu_linalg_limit:
1926 !        No use of GPU: htgspw_01.outuge value ~2  *vectsize*blocksize**2 tested
1927 !        Use of GPU:    tiny value ~0.5*vectsize*blocksize**2 tested
1928          my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf
1929          if (use_linalg_gpu/=ABI_GPU_DISABLED) my_distp(10,ii)=my_distp(10,ii)/4
1930        end if
1931        if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed
1932 !      * Weight (corrected by 10% of the computed ratio)
1933        weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb)
1934        if (autoparal==2) then
1935          write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2)') &
1936 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
1937 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |'
1938        else if (autoparal==3) then
1939          write(msg,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, &
1940 &         ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),&
1941 &         ' and use_linalg_gpu=',use_linalg_gpu
1942        else
1943          write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') &
1944 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
1945 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',&
1946 &         my_distp(9,ii),'|',use_linalg_gpu,'|'
1947        end if
1948        call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg)
1949 !      We store the best value in weight(mcount) and keep icount
1950        if (weight0 > weight(mcount)) then
1951          icount=ii;weight(mcount)=weight0
1952        end if
1953      end if
1954    end do
1955  end if
1956 
1957 !Store new process distribution
1958  if (mcount>0.and.max_ncpus<=0) then
1959    icount=isort(mcount)
1960    nproc1=my_distp(7,icount)
1961 !  Work load distribution
1962    if (optdriver==RUNL_GSTATE) then
1963      dtset%npimage= my_distp(1,icount)
1964      dtset%nppert = 1
1965      dtset%np_spkpt  = my_distp(2,icount)
1966    end if
1967    if (optdriver==RUNL_RESPFN) then
1968      dtset%npimage= 1
1969      dtset%nppert = my_distp(1,icount)
1970      dtset%np_spkpt  = 1
1971    end if
1972    dtset%npspinor = my_distp(3,icount)
1973    dtset%npfft    = my_distp(4,icount)
1974    dtset%npband   = my_distp(5,icount)
1975    dtset%bandpp   = my_distp(6,icount)
1976    if (dtset%wfoptalg==114.or.dtset%wfoptalg==14.or.dtset%wfoptalg==4) then !if LOBPCG
1977      dtset%nblock_lobpcg = mband / (dtset%npband*dtset%bandpp)
1978    end if
1979    if (tread(1)==0)  dtset%paral_kgb= merge(0,1,my_algo(icount)==ALGO_CG)
1980 !  The following lines are mandatory : the DFT+DMFT must use ALL the
1981 !  available procs specified by the user. So nproc1=nproc.
1982 !  Works only if paral_kgb is not activated??
1983    if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE) then
1984      if (dtset%paral_kgb==0) then
1985        dtset%npspinor = 1 ; dtset%npfft    = 1
1986        dtset%npband   = 1 ; dtset%bandpp   = 1
1987        dtset%npimage  = 1
1988      end if
1989      nproc1 = nproc
1990    end if
1991    if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1
1992 !  LinAlg parameters: we change values only if they are not present in input file
1993    if (dtset%paral_kgb==1) then
1994      if (tread(9)==0) dtset%use_slk=my_distp(8,icount)
1995      if (tread(10)==0) dtset%np_slk=my_distp(9,icount)
1996      if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount)
1997    end if
1998 !  New definition of "world" MPI communicator
1999    if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then
2000      nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90
2001      nproc1=min(nproc1,nproc)
2002      nproc1=(nproc1/dtset%nppert)*dtset%nppert
2003    end if
2004    call initmpi_world(mpi_enreg,nproc1)
2005  end if
2006 
2007 !Final advice in case max_ncpus > 0
2008  if (max_ncpus>0.and.mcount>0) then
2009    write(msg,'(6a)') ch10,&
2010    ' Launch a parallel version of ABINIT with a distribution of processors among the above list,',ch10,&
2011    ' and the associated input variables (np_spkpt, npband, npfft, bandpp, etc.).',ch10,&
2012    ' The higher weight should be better.'
2013    call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg)
2014  end if
2015 
2016  if (mcount>0) then
2017    ABI_FREE(isort)
2018  end if
2019  ABI_FREE(my_distp)
2020  ABI_FREE(my_algo)
2021  ABI_FREE(weight)
2022 
2023 !Final line
2024  write(msg,'(a,100("="),2a)') " ",ch10,ch10
2025  call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg)
2026 
2027 !max_ncpus requires a stop
2028  if (max_ncpus>0) then
2029    iexit = iexit + 1 ! will stop in the parent.
2030  end if
2031 
2032  DBG_EXIT("COLL")
2033 
2034 contains
2035 
2036 real(dp) pure function speedup_fdp(nn, mm)
2037   ! Expected linear speedup for a nn-sized problem and mm processes
2038   integer,intent(in) :: nn, mm
2039   speedup_fdp = (one*nn) / (one* ((nn / mm) + merge(0, 1, mod(nn, mm) == 0)))
2040 end function speedup_fdp
2041 
2042 end subroutine finddistrproc

ABINIT/m_mpi_setup [ Modules ]

[ Top ] [ Modules ]

NAME

 m_mpi_setup

FUNCTION

  Initialize MPI parameters and datastructures for parallel execution

COPYRIGHT

  Copyright (C) 1999-2024 ABINIT group (FJ, MT, FD)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 module m_mpi_setup
23 
24  use defs_basis
25  use m_distribfft
26  use m_xmpi
27  use m_xomp
28  use m_hdr
29  use m_sort
30  use m_errors
31  use m_abicore
32 
33  use defs_abitypes,  only : MPI_type
34  use m_fstrings,     only : sjoin, itoa
35  use m_time,         only : abi_wtime
36  use m_parser,       only : intagm
37  use m_geometry,     only : mkrdim, metric
38  use m_fftcore,      only : fftalg_for_npfft, getng,  kpgcount
39  use m_mpinfo,       only : init_mpi_enreg, mpi_distrib_is_ok, initmpi_atom, proc_distrb_cycle, &
40                             initmpi_grid, initmpi_pert, initmpi_img, distrb2, distrb2_hf, initmpi_world
41  use m_libpaw_tools, only : libpaw_write_comm_set
42  use m_dtset,        only : dataset_type
43  use m_kg,           only : getmpw
44  use m_dtfil,        only : mkfilename
45 
46  implicit none
47 
48  private

ABINIT/mpi_setup [ Functions ]

[ Top ] [ Functions ]

NAME

 mpi_setup

FUNCTION

 Big loop on the datasets:
 - compute mgfft,mpw,nfft,... for this data set;
 - fill mpi_enreg
  *** At the output of this routine, all the dtsets input variables are known ***
 The content of dtsets should not be modified anymore afterwards.

INPUTS

  filnam(5)=character strings giving file names
  ndtset= number of datasets to be read; if 0, no multi-dataset mode
  ndtset_alloc=number of datasets, corrected for allocation of at least
      one data set.

OUTPUT

  dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables,
   some of which are initialized here, while other were already
   initialized previously.

SIDE EFFECTS

   mpi_enregs=information about MPI parallelization

SOURCE

  84 subroutine mpi_setup(dtsets,filnam,lenstr,mpi_enregs,ndtset,ndtset_alloc,string)
  85 
  86 !Arguments ------------------------------------
  87 !scalars
  88  integer,intent(in) :: lenstr,ndtset,ndtset_alloc
  89  type(MPI_type),intent(inout) :: mpi_enregs(0:ndtset_alloc)
  90  character(len=*),intent(in) :: string
  91 !arrays
  92  character(len=fnlen),intent(in) :: filnam(5)
  93  type(dataset_type),intent(inout) :: dtsets(0:ndtset_alloc)
  94 
  95 !Local variables -------------------------------
  96 !scalars
  97  integer :: blocksize,exchn2n3d,iband,idtset,iexit,ii,iikpt,iikpt_modulo, prtvol
  98  integer :: isppol,jdtset,marr,mband_lower,mband_upper
  99  integer :: me_fft,mgfft,mgfftdg,mkmem,mpw,mpw_k,optdriver
 100  integer :: mband_mem
 101  integer :: nfft,nfftdg,nkpt,nkpt_me,npert,nproc,nproc_fft,nqpt
 102  integer :: nspink,nsppol,nsym,paral_fft,response,tnband,tread0,usepaw,vectsize
 103  integer :: fftalg,fftalga,fftalgc
 104 #ifdef HAVE_LINALG_ELPA
 105  integer :: icol,irow,np
 106 #endif
 107  logical :: fftalg_read,ortalg_read,paral_kgb_read,wfoptalg_read,do_check
 108  real(dp) :: dilatmx,ecut,ecut_eff,ecutdg_eff,ucvol
 109  character(len=500) :: msg
 110 !arrays
 111  integer :: ngfft(18),ngfftdg(18),ngfftc(3),tread(12)
 112  integer,allocatable :: intarr(:),istwfk(:),symrel(:,:,:)
 113  integer,allocatable :: mybands(:)
 114  integer,pointer :: nkpt_rbz(:)
 115  real(dp),parameter :: k0(3)=(/zero,zero,zero/)
 116  real(dp) :: gmet(3,3),gprimd(3,3),kpt(3),qphon(3),rmet(3,3),rprimd(3,3)
 117  real(dp),allocatable :: dprarr(:),kpt_with_shift(:,:)
 118  real(dp),pointer :: nband_rbz(:,:)
 119  character(len=6) :: nm_mkmem(3)
 120 
 121 !*************************************************************************
 122 
 123  DBG_ENTER("COLL")
 124 
 125  iexit=0;mpw_k=0
 126 
 127  call init_mpi_enreg(mpi_enregs(0))
 128  call initmpi_img(dtsets(0),mpi_enregs(0),-1)
 129 
 130  do idtset=1,ndtset_alloc
 131    call init_mpi_enreg(mpi_enregs(idtset))
 132 
 133    ! Handy read-only variables.
 134    optdriver = dtsets(idtset)%optdriver
 135    prtvol = dtsets(idtset)%prtvol
 136 
 137 !  Read parallel input parameters
 138    marr=max(12,dtsets(idtset)%npsp,dtsets(idtset)%nimage)
 139    ABI_MALLOC(intarr,(marr))
 140    ABI_MALLOC(dprarr,(marr))
 141    nkpt  =dtsets(idtset)%nkpt
 142    nsppol=dtsets(idtset)%nsppol
 143    jdtset=dtsets(idtset)%jdtset ; if(ndtset==0)jdtset=0
 144    usepaw=dtsets(idtset)%usepaw
 145    mband_upper=maxval(dtsets(idtset)%nband(1:nkpt*nsppol))
 146    mband_lower=minval(dtsets(idtset)%nband(1:nkpt*nsppol))
 147 
 148 !  Compute metric for this dataset
 149    call mkrdim(dtsets(idtset)%acell_orig(1:3,1),dtsets(idtset)%rprim_orig(1:3,1:3,1),rprimd)
 150    call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
 151 
 152    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'max_ncpus',tread0,'INT')
 153    if (tread0==1) dtsets(idtset)%max_ncpus=intarr(1)
 154 
 155    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_atom',tread0,'INT')
 156    if(tread0==1) dtsets(idtset)%paral_atom=intarr(1)
 157 
 158    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_rf',tread0,'INT')
 159    if (tread0==1.and.any(optdriver==[RUNL_RESPFN, RUNL_NONLINEAR])) dtsets(idtset)%paral_rf=intarr(1)
 160 
 161    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npimage',tread(2),'INT')
 162    if(tread(2)==1) dtsets(idtset)%npimage=intarr(1)
 163 
 164    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nppert',tread(3),'INT')
 165    if (tread(3)==1.and.optdriver==RUNL_RESPFN) dtsets(idtset)%nppert=intarr(1)
 166 
 167    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_spkpt',tread(4),'INT')
 168    if(tread(4)==1)then
 169      dtsets(idtset)%np_spkpt=intarr(1)
 170    else
 171 !    npkpt is obsolete, but still read
 172      call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npkpt',tread(4),'INT')
 173      if(tread(4)==1)then
 174        dtsets(idtset)%np_spkpt=intarr(1)
 175      endif
 176    endif
 177 
 178    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npspinor',tread(5),'INT')
 179    if(tread(5)==1) dtsets(idtset)%npspinor=intarr(1)
 180 
 181    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npfft',tread(6),'INT')
 182    if(tread(6)==1) dtsets(idtset)%npfft=intarr(1)
 183 
 184    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npband',tread(7),'INT')
 185    if(tread(7)==1) dtsets(idtset)%npband=intarr(1)
 186 
 187    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'bandpp',tread(8),'INT')
 188    !Nband might have different values for different kpoint, but not bandpp.
 189    !In this case, we just use the largest nband (mband_upper), and the input will probably fail
 190    !at the bandpp check later on
 191    if(tread(8)==1) then
 192      dtsets(idtset)%bandpp=intarr(1)
 193      ! check if nblock_lobpcg is read from the input, if so error msg
 194      call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nblock_lobpcg',tread0,'INT')
 195      if (tread0==1) then
 196        write(msg,'(3a)') 'Both bandpp and nblock_lobpcg are defined for the same dataset, this is confusing.',ch10,&
 197          'Change the input to keep only nblock_lobpcg (preferably) or bandpp.'
 198        ABI_ERROR(msg)
 199      end if
 200      if (dtsets(idtset)%wfoptalg==114.or.dtsets(idtset)%wfoptalg==14.or.dtsets(idtset)%wfoptalg==4) then !if LOBPCG
 201        if (mod(mband_upper,dtsets(idtset)%bandpp*dtsets(idtset)%npband)==0) then
 202          dtsets(idtset)%nblock_lobpcg=mband_upper/(dtsets(idtset)%bandpp*dtsets(idtset)%npband)
 203        else
 204          write(msg,'(5a)') 'mband_upper( =max_{kpt}(nband) ) should be a mutltiple of npband*bandpp.',ch10,&
 205            'Change nband, npband or bandpp in the input.',ch10,&
 206            'A simpler solution is to use nblock_lobpcg instead of bandpp.'
 207          ABI_ERROR(msg)
 208        end if
 209      end if
 210    else if (dtsets(idtset)%wfoptalg==114.or.dtsets(idtset)%wfoptalg==14.or.dtsets(idtset)%wfoptalg==4) then !if LOBPCG
 211      if (mod(mband_upper,dtsets(idtset)%nblock_lobpcg*dtsets(idtset)%npband)==0) then
 212        dtsets(idtset)%bandpp=mband_upper/(dtsets(idtset)%nblock_lobpcg*dtsets(idtset)%npband)
 213      else
 214        write(msg,'(3a)') 'mband_upper( =max_{kpt}(nband) ) should be a mutltiple of nblock_lobpcg*npband.',ch10,&
 215          'Change nband, npband or nblock_lobpcg in the input.'
 216        ABI_ERROR(msg)
 217      end if
 218    end if
 219 
 220    ! Warning when using different number of bands for different kpoints (occopt=2)
 221    if ( dtsets(idtset)%occopt==2 .and. dtsets(idtset)%nkpt>1 .and. &
 222      & ((dtsets(idtset)%bandpp > 1) .or. (dtsets(idtset)%npband > 1)) ) then
 223      write(msg, '(4a)' ) &
 224        'When working with blocks of bands (bandpp>1 or npband>1),'&
 225       &' the number of bands should be the same for every kpoints.',ch10,&
 226       &' The run will most probably fail on a other check. If it does not fail, ignore this message.'
 227      ABI_WARNING(msg)
 228    end if
 229 
 230    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'use_slk',tread(9),'INT')
 231    if(tread(9)==1) dtsets(idtset)%use_slk=intarr(1)
 232 
 233    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_slk',tread(10),'INT')
 234    if(tread(10)==1) dtsets(idtset)%np_slk=intarr(1)
 235 
 236    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'slk_rankpp',tread(12),'INT')
 237    if(tread(12)==1) dtsets(idtset)%slk_rankpp=intarr(1)
 238 
 239    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pw_unbal_thresh',tread0,'DPR')
 240    if(tread0==1) dtsets(idtset)%pw_unbal_thresh=dprarr(1)
 241    mpi_enregs(idtset)%pw_unbal_thresh=dtsets(idtset)%pw_unbal_thresh
 242 
 243    call intagm(dprarr,intarr,jdtset,marr,12,string(1:lenstr),'gpu_devices',tread0,'INT')
 244    if(tread0==1) dtsets(idtset)%gpu_devices(1:12)=intarr(1:12)
 245 
 246    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_kokkos_nthrd',tread0,'INT')
 247    if(tread0==1) dtsets(idtset)%gpu_kokkos_nthrd=intarr(1)
 248 
 249    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_linalg_limit',tread(11),'INT')
 250    if(tread(11)==1) dtsets(idtset)%gpu_linalg_limit=intarr(1)
 251 
 252    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_nl_distrib',tread0,'INT')
 253    if(tread0==1) dtsets(idtset)%gpu_nl_distrib=intarr(1)
 254 
 255    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_nl_splitsize',tread0,'INT')
 256    if(tread0==1) dtsets(idtset)%gpu_nl_splitsize=intarr(1)
 257 
 258    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nphf',tread0,'INT')
 259    if(tread0==1) dtsets(idtset)%nphf=intarr(1)
 260 
 261    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'autoparal',tread0,'INT')
 262    if(tread0==1) dtsets(idtset)%autoparal=intarr(1)
 263 
 264 !  Read paral_kgb and disable it if not supported in optdriver
 265    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_kgb',tread(1),'INT')
 266    paral_kgb_read=(tread(1)==1)
 267    if (paral_kgb_read) dtsets(idtset)%paral_kgb=intarr(1)
 268    if (xmpi_paral==0.and.dtsets(idtset)%paral_kgb==1) then
 269      dtsets(idtset)%paral_kgb=0
 270      write(msg, '(5a)' ) &
 271      'When ABINIT is compiled without MPI flag,',ch10,&
 272      'setting paral_kgb/=0 is useless. paral_kgb has been reset to 0.',ch10,&
 273      'Action: modify compilation option or paral_kgb in the input file.'
 274      ABI_WARNING(msg)
 275    end if
 276    if (ALL(optdriver /= [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT]) .and. dtsets(idtset)%paral_kgb/=0) then
 277      dtsets(idtset)%paral_kgb=0
 278      write(msg, '(a,i0,a)') &
 279       "paral_kgb != 0 is not available in optdriver ",optdriver,". Setting paral_kgb to 0"
 280      ABI_COMMENT(msg)
 281    end if
 282 
 283    wfoptalg_read=.false.
 284    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'wfoptalg',tread0,'INT')
 285    if(tread0==1) then
 286      dtsets(idtset)%wfoptalg=intarr(1)
 287      wfoptalg_read=.true.
 288    else
 289      if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0
 290      if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10
 291      if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114
 292    end if
 293 
 294    ! Dump the list of irreducible perturbations and exit.
 295    if (dtsets(idtset)%paral_rf==-1.and.optdriver/=RUNL_NONLINEAR) then
 296      call dtsets(idtset)%get_npert_rbz(nband_rbz, nkpt_rbz, npert)
 297      ABI_FREE(nband_rbz)
 298      ABI_FREE(nkpt_rbz)
 299      iexit = iexit + 1
 300    end if
 301 
 302    ! From total number of procs, compute all possible distributions
 303    ! Ignore exit flag if GW/EPH calculations because autoparal section is performed in screening/sigma/bethe_salpeter/eph
 304    if (any(optdriver == [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR, RUNL_NONLINEAR])) then
 305      iexit = 0
 306    else
 307      call finddistrproc(dtsets,filnam,idtset,iexit,mband_upper,mpi_enregs(idtset),ndtset_alloc,tread)
 308    end if
 309 
 310    call initmpi_img(dtsets(idtset),mpi_enregs(idtset),-1)
 311    nproc=mpi_enregs(idtset)%nproc_cell
 312 
 313 !  Set paral_kgb to 1 when band-fft parallelism is activated
 314    if (ANY(optdriver == [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT])) then
 315      if (mpi_enregs(idtset)%nproc_cell>1) then
 316        if (dtsets(idtset)%npband>1.or.dtsets(idtset)%npfft>1) then
 317          if (.not.paral_kgb_read) dtsets(idtset)%paral_kgb=1
 318        end if
 319      end if
 320    end if
 321      
 322    if ((optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_GWLS.and.optdriver/=RUNL_RTTDDFT).and. &
 323 &   (dtsets(idtset)%np_spkpt/=1   .or.dtsets(idtset)%npband/=1.or.dtsets(idtset)%npfft/=1.or. &
 324 &   dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1)) then
 325 !&   .or.(dtsets(idtset)%iscf<0)) then
 326      dtsets(idtset)%np_spkpt=1 ; dtsets(idtset)%npspinor=1 ; dtsets(idtset)%npfft=1
 327      dtsets(idtset)%npband=1; dtsets(idtset)%bandpp=1  ; dtsets(idtset)%nphf=1
 328      dtsets(idtset)%paral_kgb=0
 329      ABI_COMMENT('For non ground state calculations, set bandpp, npfft, npband, npspinor, np_spkpt and nphf to 1')
 330    end if
 331 
 332 !  Take into account a possible change of paral_kgb (change of the default algorithm)
 333    if (.not.wfoptalg_read) then
 334      if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0
 335      if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10
 336      if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114
 337      if (mod(dtsets(idtset)%wfoptalg,10)==4) then
 338        do iikpt=1,dtsets(idtset)%nkpt
 339          if (any(abs(dtsets(idtset)%kpt(:,iikpt))>tol8)) dtsets(idtset)%istwfk(iikpt)=1
 340        end do
 341      end if
 342    end if
 343 
 344    dtsets(idtset)%densfor_pred=2
 345    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'densfor_pred',tread0,'INT')
 346    if(tread0==1) then
 347      dtsets(idtset)%densfor_pred=intarr(1)
 348    else
 349      if (dtsets(idtset)%paral_kgb==1) dtsets(idtset)%densfor_pred=6
 350    end if
 351    if((dtsets(idtset)%iscf==5.or.dtsets(idtset)%iscf==6) &
 352       .and. dtsets(idtset)%ionmov==4 .and. dtsets(idtset)%densfor_pred/=3 )then
 353      dtsets(idtset)%densfor_pred=3
 354      write(msg, '(a,a,a)' )&
 355      'When ionmov==4 and iscf==5 or 6, densfor_pred must be 3.',ch10,&
 356      'Set densfor_pred to 3.'
 357      ABI_COMMENT(msg)
 358    end if
 359 
 360 #ifdef HAVE_LOTF
 361 !  LOTF need densfor_pred=2
 362    if(dtsets(idtset)%ionmov==23) dtsets(idtset)%densfor_pred=2
 363 #endif
 364 
 365    if (usepaw==0) then
 366      dtsets(idtset)%ortalg=2
 367    else
 368      dtsets(idtset)%ortalg=-2
 369    end if
 370    ortalg_read=.false.
 371    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'ortalg',tread0,'INT')
 372    if(tread0==1) then
 373      dtsets(idtset)%ortalg=intarr(1)
 374      ortalg_read=.true.
 375    else if (dtsets(idtset)%wfoptalg>=10 .and. dtsets(idtset)%ortalg>0) then
 376      dtsets(idtset)%ortalg=-dtsets(idtset)%ortalg
 377    end if
 378 
 379    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'iomode',tread0,'INT')
 380    if(tread0==1) then
 381      dtsets(idtset)%iomode=intarr(1)
 382    else
 383      if ((xmpi_mpiio==1).and.(dtsets(idtset)%paral_kgb==1)) dtsets(idtset)%iomode=IO_MODE_MPI
 384 #ifdef HAVE_NETCDF_DEFAULT
 385      dtsets(idtset)%iomode=IO_MODE_ETSF
 386 #endif
 387    end if
 388 
 389    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pawmixdg',tread0,'INT')
 390    if(tread0==1) then
 391      dtsets(idtset)%pawmixdg=intarr(1)
 392    else if (dtsets(idtset)%npfft>1.and.usepaw==1) then
 393      dtsets(idtset)%pawmixdg=1
 394    end if
 395      
 396 !  Cycle if the processor is not used
 397    if (mpi_enregs(idtset)%me<0.or.iexit>0) then
 398      ABI_FREE(intarr)
 399      ABI_FREE(dprarr)
 400      cycle
 401    end if
 402 
 403    ! Set cprj_in_memory to zero if paral_kgb has been activated by autoparal
 404    if (dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%cprj_in_memory = 0
 405 
 406    response=0
 407    if (dtsets(idtset)%rfddk/=0 .or. dtsets(idtset)%rf2_dkdk/=0 .or. dtsets(idtset)%rf2_dkde/=0 .or. &
 408 &   dtsets(idtset)%rfelfd/=0 .or. dtsets(idtset)%rfphon/=0 .or. dtsets(idtset)%rfstrs/=0 .or. &
 409 &   dtsets(idtset)%rfuser/=0 .or. dtsets(idtset)%rfmagn/=0) response=1
 410 
 411 !  If no MPI, set all npxxx variables to 1
 412    if (nproc==1) then
 413      dtsets(idtset)%np_spkpt    = 1 ; dtsets(idtset)%npband   = 1
 414      dtsets(idtset)%npfft    = 1 ; dtsets(idtset)%npspinor = 1
 415      dtsets(idtset)%nphf     = 1
 416    end if
 417 
 418 !    --IF CUDA AND RECURSION:ONLY BAND PARALLELISATION
 419    if(dtsets(idtset)%tfkinfunc==2 .and. nproc/=1)then
 420      dtsets(idtset)%npband = dtsets(idtset)%npband*dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor*dtsets(idtset)%npfft
 421      dtsets(idtset)%bandpp=1
 422      dtsets(idtset)%np_spkpt = 1
 423      dtsets(idtset)%npfft = 1
 424      dtsets(idtset)%npspinor = 1
 425      write(msg, '(5a,i6,a)' )&
 426      'If the recursion scheme is used (tfkinfunc==2),',ch10,&
 427      'only the band parallelisation is active, we set:',ch10,&
 428      'npfft= 1, np_spkpt= 1, npband=',dtsets(idtset)%npband,' .'
 429      ABI_WARNING(msg)
 430    end if
 431 
 432    if (dtsets(idtset)%npspinor>=2.and.dtsets(idtset)%nspinor==1) then
 433      dtsets(idtset)%npspinor=1
 434      dtsets(idtset)%npfft=2*dtsets(idtset)%npfft
 435      write(msg,'(3a)')&
 436      'npspinor is bigger than nspinor !',ch10,&
 437      'We set npspinor to 1 ; we set npfft to 2*npfft'
 438      ABI_WARNING(msg)
 439    end if
 440 
 441 !  Some checks on parallelization data
 442    if(dtsets(idtset)%paral_kgb < 0 ) then
 443      cycle
 444    else if(dtsets(idtset)%paral_kgb/=0.and.(dtsets(idtset)%bandpp/=1.or.dtsets(idtset)%npband/=1.or.&
 445            dtsets(idtset)%npfft/=1.or.dtsets(idtset)%np_spkpt/=1.or.dtsets(idtset)%npspinor/=1))then
 446      if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor > nproc )then
 447        write(msg,'(7a)')&
 448        'The product of np_spkpt, npfft, npband and npspinor is bigger than the number of processors.',ch10,&
 449        'The user-defined values of np_spkpt, npfft, npband or npspinor will be modified,',ch10,&
 450        'in order to bring this product below nproc .',ch10,&
 451        'At present, only a very simple algorithm is used ...'
 452        ABI_WARNING(msg)
 453 
 454        if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npband*dtsets(idtset)%npspinor <= nproc) then
 455          dtsets(idtset)%npfft=1
 456          ABI_WARNING('Set npfft to 1')
 457        else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor <= nproc)then
 458          dtsets(idtset)%npfft=1
 459          dtsets(idtset)%npband=1
 460          ABI_WARNING('Set npfft and npband to 1')
 461        else if(dtsets(idtset)%np_spkpt <= nproc)then
 462          dtsets(idtset)%npfft=1
 463          dtsets(idtset)%npband=1
 464          dtsets(idtset)%npspinor=1
 465          ABI_WARNING('Set npfft ,npband and npspinor to 1')
 466        else
 467          dtsets(idtset)%npfft=1
 468          dtsets(idtset)%npband=1
 469          dtsets(idtset)%np_spkpt=1
 470          dtsets(idtset)%npspinor=1
 471          ABI_WARNING('Set npfft, npband, nspinor and np_spkpt to 1')
 472        end if
 473      else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor < nproc)then
 474        write(msg,'(a,5i6,4a)')&
 475        'np_spkpt,npband,npspinor,npfft,nproc=',&
 476 &      dtsets(idtset)%np_spkpt,dtsets(idtset)%npfft,dtsets(idtset)%npband,dtsets(idtset)%npspinor,nproc,ch10,&
 477        'The number of processors must not be greater than npfft*npband*np_spkpt*npspinor ',ch10,&
 478        'when npfft or np_spkpt or npband or npspinor are chosen manually in the input file.'
 479        ABI_ERROR(msg)
 480      end if
 481    end if
 482 
 483 !  LOBPCG and ChebFi need paral_kgb=1 in parallel
 484    if ((dtsets(idtset)%npband*dtsets(idtset)%npfft>1).and. &
 485 &   (mod(dtsets(idtset)%wfoptalg,10)==1.or.mod(dtsets(idtset)%wfoptalg,10)==4)) then
 486      dtsets(idtset)%paral_kgb=1
 487    end if
 488 
 489 !  Check size of Scalapack communicator
 490 #ifdef HAVE_LINALG_ELPA
 491    if(dtsets(idtset)%paral_kgb>0.and.dtsets(idtset)%np_slk>0) then
 492      np=min(dtsets(idtset)%np_slk,dtsets(idtset)%npband*dtsets(idtset)%npfft*dtsets(idtset)%npspinor)
 493      irow=int(sqrt(float(np)))
 494      do while(mod(np,irow)/=0)
 495        irow=irow-1
 496      end do
 497      icol=nproc/irow
 498      if (icol>mband_lower) then
 499        do while(icol>mband_lower)
 500          icol=icol-1
 501          do while(mod(np,icol)/=0)
 502            icol=icol-1
 503          end do
 504        end do
 505        dtsets(idtset)%np_slk=icol
 506        write(msg,'(5a,i6,a)')&
 507        'The number of band*fft*spinor processors was not consistent with',ch10,&
 508        'the size of communicator used for ELPA library (np_slk).',ch10,&
 509        'np_slk value has been adjusted to ',dtsets(idtset)%np_slk,'.'
 510        ABI_COMMENT(msg)
 511      end if
 512    end if
 513 #endif
 514 
 515    !Additional check in case of a parallelized Hartree-Fock calculation
 516    !   %usefock == option to perform Fock exchange calculation
 517    !   %nphf   == number of processors for Fock exchange calculation
 518    if ((dtsets(idtset)%usefock==1).and.(dtsets(idtset)%nphf/=1)) then
 519 
 520      if ((dtsets(idtset)%nphf<0).or.(dtsets(idtset)%nphf==0)) then
 521        ABI_ERROR('The value of variable nphf should be a non negative integer.')
 522      end if
 523      if (dtsets(idtset)%paral_kgb/=0) then
 524        ABI_ERROR('Option paral_kgb should be turned off (value 0) for a parallelized Hartree-Fock calculation.')
 525      end if
 526      if (response/=0) then
 527        ABI_ERROR('A response function calculation is not yet possible with a parallelized Hartree-Fock calculation.')
 528      end if
 529      if (dtsets(idtset)%npspinor>1) then
 530        ABI_ERROR('The parallelism on spinors is not supported by a parallelized Hartree-Fock calculation.')
 531      end if
 532      if (dtsets(idtset)%np_spkpt*dtsets(idtset)%nphf > nproc )then
 533        write(msg,'(a,3(a,i0))') ch10,&
 534        'The product of variables np_spkpt and nphf is bigger than the number of processors: np_spkpt= ',&
 535        dtsets(idtset)%np_spkpt,' nphf= ',dtsets(idtset)%nphf  ,' and nproc= ', nproc
 536        ABI_ERROR(msg)
 537      end if
 538    end if ! Fock
 539 
 540    !When using chebfi, the number of blocks is equal to the number of processors
 541    if((dtsets(idtset)%wfoptalg == 1) .or. (dtsets(idtset)%wfoptalg == 111)) then
 542      !Nband might have different values for different kpoint, but not bandpp.
 543      !In this case, we just use the largest nband (mband_upper), and the input will probably fail
 544      !at the bandpp check later on
 545      dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband
 546      if(tread(8)==1) then
 547        write(msg, '(a,i8,3a)' ) &
 548        'bandpp has been internally set to ',dtsets(idtset)%bandpp,'.',ch10,&
 549        'Indeed, there is no need to specify bandpp in the input when using chebfi (wfoptalg=1,111).'
 550        ABI_COMMENT(msg)
 551      end if
 552    end if
 553 
 554    !Check parallelization in case of RTTDDFT
 555    !In particular ensure that bandpp = nband / npband
 556    if (optdriver == RUNL_RTTDDFT) then
 557       dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband
 558       if ( tread(8) == 1 ) then
 559          write(msg, '(a,a)') 'Setting bandpp is useless in RT-TDDFT because it is automatically set to nband/npband.', ch10
 560          ABI_WARNING(msg)
 561       end if
 562       if (dtsets(idtset)%npfft/=1) then
 563          dtsets(idtset)%npfft=1
 564          write(msg, '(a,a)') 'RT-TDDFT is not compatible with FFT-parallelization. Remove npfft or set it to 1.', ch10
 565          ABI_ERROR(msg)
 566       end if
 567       if (dtsets(idtset)%npspinor/=1) then
 568          dtsets(idtset)%npspinor=1
 569          write(msg, '(a,a)') 'RT-TDDFT is not compatible with spinor parallelization. Remove npspinor or set it to 1.', ch10
 570          ABI_ERROR(msg)
 571       end if
 572       if (dtsets(idtset)%nphf/=1) then
 573          dtsets(idtset)%nphf=1
 574          write(msg, '(a,a)') 'RT-TDDFT is not compatible with HF parallelization. Remove nphf or set it to 1.', ch10
 575          ABI_ERROR(msg)
 576       end if
 577    end if
 578 
 579 !  Set mpi_enreg
 580    mpi_enregs(idtset)%paral_kgb=dtsets(idtset)%paral_kgb
 581    if(dtsets(idtset)%paral_kgb/=0)then
 582      mpi_enregs(idtset)%nproc_spkpt=dtsets(idtset)%np_spkpt
 583      mpi_enregs(idtset)%nproc_fft=dtsets(idtset)%npfft
 584      mpi_enregs(idtset)%nproc_band=dtsets(idtset)%npband
 585      mpi_enregs(idtset)%nproc_spinor=min(dtsets(idtset)%npspinor,dtsets(idtset)%nspinor)
 586      mpi_enregs(idtset)%bandpp=dtsets(idtset)%bandpp
 587 !    Additional setting in case of hybrid functional calculation => not yet tested (CMartins)
 588 !     if (dtsets(idtset)%usefock==1) then
 589 !       mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf
 590 !       if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1
 591 !     end if
 592    else
 593      mpi_enregs(idtset)%bandpp = dtsets(idtset)%bandpp
 594 !    Additional setting in case of a Fock exchange of PBE0 calculation
 595      if (dtsets(idtset)%usefock==1) then
 596        if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1
 597        mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf
 598        if (dtsets(idtset)%np_spkpt/=1) then
 599          mpi_enregs(idtset)%nproc_spkpt = dtsets(idtset)%np_spkpt
 600        else
 601          mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell/mpi_enregs(idtset)%nproc_hf
 602        end if
 603      else
 604        mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell
 605      end if
 606    end if
 607 
 608    if(dtsets(idtset)%paral_kgb>=0) then
 609 
 610 !    Compute processor distribution over perturbations
 611      mpi_enregs(idtset)%paral_pert=dtsets(idtset)%paral_rf
 612      if (mpi_enregs(idtset)%paral_pert==1) then
 613        dtsets(idtset)%nppert=max(1,dtsets(idtset)%nppert)
 614        if(dtsets(idtset)%nppert>mpi_enregs(idtset)%nproc) then
 615          ABI_ERROR('The number of processors must not be smaller than nppert !')
 616        end if
 617        call initmpi_pert(dtsets(idtset),mpi_enregs(idtset))
 618        mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell
 619        nproc=mpi_enregs(idtset)%nproc_cell
 620      end if
 621 !    Cycle if the processor is not used
 622      if (mpi_enregs(idtset)%me<0) then
 623        ABI_FREE(intarr)
 624        ABI_FREE(dprarr)
 625        cycle
 626      end if
 627 
 628 !    Compute processor distribution over kpt (and eventually band-fft)
 629      call initmpi_grid(mpi_enregs(idtset))
 630      if(dtsets(idtset)%usewvl==1) mpi_enregs(idtset)%comm_fft=mpi_enregs(idtset)%comm_cell
 631 
 632 !    Initialize tabs used for k/spin parallelism (with sequential-type values)
 633      ABI_MALLOC(mpi_enregs(idtset)%proc_distrb,(nkpt,mband_upper,nsppol))
 634      ABI_MALLOC(mpi_enregs(idtset)%my_kpttab,(nkpt))
 635      mpi_enregs(idtset)%proc_distrb(:,:,:)=0
 636      mpi_enregs(idtset)%my_kpttab(:)=(/(ii,ii=1,nkpt)/)
 637      mpi_enregs(idtset)%my_isppoltab(:)=1;if (dtsets(idtset)%nsppol==1) mpi_enregs(idtset)%my_isppoltab(2)=0
 638 
 639 !    HF or hybrid calculation : initialization of the array distrb_hf
 640      if (dtsets(idtset)%usefock==1) then
 641        ABI_MALLOC(mpi_enregs(idtset)%distrb_hf,(dtsets(idtset)%nkpthf,dtsets(idtset)%nbandhf,1))
 642 !      The dimension of distrb_hf are given by %nkpthf and %nbandhf.
 643 !      We assume that there will be no dependence in spinpol for all the occupied states.
 644        mpi_enregs(idtset)%distrb_hf=0
 645      end if
 646 
 647 !    Define k-points distribution (determine who I am)
 648 !    Note that nkpt_me may differ from processor to processor
 649 !    This fact will NOT be taken into account when
 650 !    the memory needs will be evaluated in the subroutine memory.
 651 !    Also, the reduction of k points due to symmetry in RF calculations
 652 !    is NOT taken into account. This should be changed later ...
 653      nkpt_me=nkpt
 654      mband_mem=0
 655      if(xmpi_paral==1 .and. dtsets(idtset)%usewvl == 0) then
 656        nkpt_me=0
 657        if(response==0 .or. (response==1 .and. dtsets(idtset)%efmas==1))then
 658          mpi_enregs(idtset)%paralbd=0
 659          call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
 660          do iikpt=1,nkpt
 661            if(.not.(proc_distrb_cycle(mpi_enregs(idtset)%proc_distrb,iikpt,1,1,-1,mpi_enregs(idtset)%me_kpt)))&
 662 &           nkpt_me=nkpt_me+1
 663          end do ! ikpt=1,nkpt
 664 !        HF or hybrid calculation : define the occupied states distribution (in array distrb_hf)
 665          if (dtsets(idtset)%usefock==1) then
 666            call distrb2_hf(dtsets(idtset)%nbandhf,dtsets(idtset)%nkpthf,nproc,nsppol,mpi_enregs(idtset))
 667          end if
 668        else ! response==1
 669 !  TODO: check or remove the following comment which seems outdated
 670 !        Wrongly assumes that the number of elements of the
 671 !        k-point sets of the two spin polarizations is the maximal
 672 !        value of one of these k-point sets ...
 673 !        This is to be corrected when RF is implemented
 674 !        for spin-polarized case.
 675 !  ENDTODO
 676          mpi_enregs(idtset)%paralbd=1
 677 !        nproc=mpi_enregs(idtset)%nproc_cell*mpi_enregs(idtset)%nproc_pert
 678          call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
 679          do isppol=1,nsppol
 680            nspink=0
 681            do iikpt=1,nkpt
 682              do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt)
 683                if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_cell)then
 684                  nspink=nspink+1
 685                  exit
 686                end if
 687              end do ! iband
 688            end do ! iikpt
 689            if(nspink>nkpt_me)nkpt_me=nspink
 690          end do ! isppol
 691 !        Is nband present in input file or automatically estimated ?
 692          tnband=0
 693          call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nband',tnband,'INT')
 694 !        If the number of bands was estimated, there might be a side effect
 695 !        when the definitive number of bands is known. k points
 696 !        might be attributed to different processors than the present
 697 !        proc_distrb describes. At most, the number of k points could increase by 1 ...
 698          if(tnband==0)nkpt_me=nkpt_me+1
 699 !        In any case, the maximal number of k points is nkpt
 700          if(nkpt_me>nkpt)nkpt_me=nkpt
 701 
 702 !        mband_mem
 703          ABI_MALLOC (mybands, (mband_upper))
 704          mband_mem = 0
 705          do isppol=1,nsppol
 706            do iikpt=1,nkpt
 707              mybands = 0
 708              do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt)
 709                if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_band)then
 710                  mybands(iband)=1
 711                end if
 712              end do ! iband
 713              mband_mem = max(mband_mem, sum(mybands))
 714            end do ! iikpt
 715          end do ! isppol
 716          ABI_FREE (mybands)
 717        end if ! response case
 718      end if
 719    end if
 720    if (mband_mem == 0) mband_mem = mband_upper
 721    dtsets(idtset)%mband_mem = mband_mem
 722 
 723 !  Take care of mkmems. Use the generic name -mkmem- for mkmem as well as mkqmem
 724 !  and mk1mem.
 725    nm_mkmem(1)='mkmem '
 726    nm_mkmem(2)='mkqmem'
 727    nm_mkmem(3)='mk1mem'
 728 
 729    do ii=1,3
 730 
 731 !    Read in mkmem here if it is in the input file
 732 !    TODO: mkmem is not supported any longer. These variables can be removed.
 733      if(ii==1)then
 734        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkmem',tread0,'INT')
 735      else if(ii==2)then
 736        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkqmem',tread0,'INT')
 737      else if(ii==3)then
 738        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mk1mem',tread0,'INT')
 739      end if
 740 
 741 
 742 !    Note that mkmem is used as a dummy variable, representing mkmem as well
 743 !    as mkqmem, and mk1mem.
 744      if(tread0==1) then
 745        mkmem=intarr(1)
 746        if (mkmem<0) then
 747 !        mkmem is unreasonable; must be zero or positive
 748          write(msg, '(4a,i0,4a)')&
 749          nm_mkmem(ii),' must be positive or null but ',nm_mkmem(ii),' =',mkmem,ch10,&
 750          'Use default ',nm_mkmem(ii),' = nkpt .'
 751          ABI_WARNING(msg)
 752          mkmem=nkpt
 753        end if
 754 
 755      else
 756 
 757        !  mkmem was not set in the input file so default to incore solution
 758        !write(msg,'(6a)') &
 759        !'mpi_setup: ',nm_mkmem(ii),' undefined in the input file.','Use default ',nm_mkmem(ii),' = nkpt'
 760        !call wrtout(std_out, msg)
 761        mkmem=nkpt
 762      end if
 763 
 764 !    Check whether nkpt distributed on the processors <= mkmem;
 765 !    if so then may run entirely in core,
 766 !    avoiding i/o to disk for wavefunctions and kg data.
 767 !    mkmem/=0 to avoid i/o; mkmem==0 to use disk i/o for nkpt>=1.
 768      if (nkpt_me<=mkmem .and. mkmem/=0 ) then
 769        write(msg, '(a,i0,a,a,a,i0,a)' ) &
 770         ' mpi_setup: With nkpt_me=',nkpt_me,' and ',nm_mkmem(ii),' = ',mkmem,', ground state wf handled in core.'
 771        if (prtvol > 0) call wrtout(std_out,msg)
 772        if(nkpt_me<mkmem .and. nkpt_me/=0)then
 773          write(msg,'(3a)')' Resetting ',nm_mkmem(ii),' to nkpt_me to save memory space.'
 774          mkmem=nkpt_me
 775          if (prtvol > 0) call wrtout(std_out,msg)
 776        end if
 777      else if(mkmem/=0)then
 778        write(msg, '(a,i0,3a,i0,5a)' ) &
 779        ' mpi_setup: With nkpt_me=',nkpt_me,'and ',nm_mkmem(ii),' = ',mkmem,&
 780        ' ground state wf require disk i/o.',ch10,&
 781        ' Resetting ',nm_mkmem(ii),' to zero to save memory space.'
 782        mkmem=0
 783        if (prtvol > 0) call wrtout(std_out,msg)
 784      end if
 785      if(dtsets(idtset)%usewvl == 0 .or. dtsets(idtset)%usepaw==1)then
 786        if(ii==1)dtsets(idtset)%mkmem=mkmem
 787      end if
 788      if(ii==2)dtsets(idtset)%mkqmem=mkmem
 789      if(ii==3)dtsets(idtset)%mk1mem=mkmem
 790 
 791      if(dtsets(idtset)%usewvl == 1 .and. dtsets(idtset)%usepaw==1 )then
 792        if(dtsets(idtset)%mkmem .ne. dtsets(idtset)%nkpt) then
 793          ABI_ERROR("mkmem is not allowed for WVL+PAW")
 794        end if
 795      end if
 796 
 797    end do  ! End the loop on the three possiblities mkmem, mkqmem, mk1mem.
 798 
 799    if(dtsets(idtset)%paral_kgb==1) mpi_enregs(idtset)%paralbd=0
 800 
 801 !  Check if some MPI processes are empty (MBPT codes uses a complete different MPI algorithm)
 802    do_check = all(optdriver /= [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR])
 803    if (dtsets(idtset)%usewvl == 0 .and. do_check) then
 804      if (.not.mpi_distrib_is_ok(mpi_enregs(idtset),mband_upper,&
 805           dtsets(idtset)%nkpt,dtsets(idtset)%mkmem,nsppol,msg=msg)) then
 806        write(msg,'(5a)') trim(msg),ch10,&
 807          'YOU ARE STRONGLY ADVISED TO ACTIVATE AUTOMATIC PARALLELIZATION!',ch10,&
 808          'USE "AUTOPARAL=1" IN THE INPUT FILE.'
 809        ABI_WARNING(msg)
 810      end if
 811    end if
 812 
 813 !  call mpi_setup1(dtsets(idtset),jdtset,lenstr,mband_upper,mpi_enregs(idtset),string)
 814 !  Printing of processor distribution
 815 !  MPIWF : here, set up the complete ngfft, containing the information
 816 !  for the parallelisation of the FFT
 817    call abi_io_redirect(new_io_comm=mpi_enregs(idtset)%comm_world)
 818    call libpaw_write_comm_set(mpi_enregs(idtset)%comm_world)
 819 
 820 !  Default values for sequential case
 821    paral_fft=0; nproc_fft=1; me_fft=0
 822 
 823    if(dtsets(idtset)%usewvl == 0)then
 824      if(optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then
 825        paral_fft=1           ! parallelisation over FFT
 826        if (mpi_enregs(idtset)%nproc_cell>0) then
 827          if(mpi_enregs(idtset)%paral_kgb == 1) then
 828 
 829            if((dtsets(idtset)%gpu_option/=ABI_GPU_DISABLED).and.(mpi_enregs(idtset)%nproc_fft/=1))then
 830              write(msg,'(3a,i0)') &
 831              'When the use of GPU is on, the number of FFT processors, npfft, must be 1',ch10,&
 832              'However, npfft=',mpi_enregs(idtset)%nproc_fft
 833              ABI_ERROR(msg)
 834            end if
 835 
 836            if(modulo(dtsets(idtset)%ngfft(2),mpi_enregs(idtset)%nproc_fft)/=0)then
 837              write(msg,'(3a,i0,a,i0)') &
 838              'The number of FFT processors, npfft, should be a multiple of ngfft(2).',ch10,&
 839              'However, npfft=',mpi_enregs(idtset)%nproc_fft,' and ngfft(2)=',dtsets(idtset)%ngfft(2)
 840              ABI_BUG(msg)
 841            end if
 842 
 843            do iikpt=1,nkpt*nsppol
 844              iikpt_modulo = modulo(iikpt,nkpt)+1
 845              if ((dtsets(idtset)%istwfk(iikpt_modulo)==2)) then !.and.(dtsets(idtset)%ngfft(7)==401)) then
 846                if ((mpi_enregs(idtset)%bandpp==0).or. &
 847                ((mpi_enregs(idtset)%bandpp/=1).and.(modulo(mpi_enregs(idtset)%bandpp,2)/=0))) then
 848                  write(msg,'(3a,i0)') &
 849                  'The number bandpp should be 1 or a multiple of 2',ch10,&
 850                  'However, bandpp=',mpi_enregs(idtset)%bandpp
 851                  ABI_BUG(msg)
 852                end if
 853                if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
 854                  write(msg,'(3a,i0,a,i0)') &
 855                  'The number of bands for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,&
 856                  'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
 857                  mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
 858                  ABI_BUG(msg)
 859                end if
 860              else if ((dtsets(idtset)%istwfk(iikpt_modulo)==2) .and. (dtsets(idtset)%ngfft(7)==400)) then
 861                ABI_BUG('The fftalg=400 with istwfk=2 is not valid')
 862              else
 863                if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
 864                  write(msg,'(3a,i0,a,i0)') &
 865                  'The number of band for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,&
 866                  'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
 867                  mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
 868                  ABI_BUG(msg)
 869                end if
 870                if ((mpi_enregs(idtset)%bandpp==0)) then
 871                  write(msg,'(a,i0,2a,i0,2a,i0)')&
 872                  'The number bandpp should not be 0 with fftalg=',dtsets(idtset)%ngfft(7),ch10,&
 873                  'and istwfk=',dtsets(idtset)%istwfk(iikpt_modulo),ch10,&
 874                  'However, bandpp=',mpi_enregs(idtset)%bandpp
 875                  ABI_BUG(msg)
 876                end if
 877              end if
 878            end do
 879 
 880            if (xmpi_paral==1) then
 881              if(modulo(nkpt*nsppol,mpi_enregs(idtset)%nproc_spkpt)/=0)then
 882                write(msg,'(3a,i0,a,i0)') &
 883                'The number of KPT processors, np_spkpt, should be a multiple of nkpt*nsppol.',ch10,&
 884                'However, np_spkpt=',mpi_enregs(idtset)%nproc_spkpt,' and nkpt*nsppol=',nkpt*nsppol
 885                ABI_WARNING(msg)
 886              end if
 887            end if
 888          else
 889            do iikpt=1,nkpt*nsppol
 890              iikpt_modulo = modulo(iikpt,nkpt)+1
 891              if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
 892                write(msg,'(3a,i0,a,i0)') &
 893                'The number of band for the k-point, nband_k, should be a multiple of npband*bandpp.',ch10,&
 894                'However, nband_k=',dtsets(idtset)%nband(iikpt),' and npband*bandpp=', &
 895                mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
 896                ABI_BUG(msg)
 897              end if
 898            end do
 899          end if
 900        end if
 901        nproc_fft=mpi_enregs(idtset)%nproc_fft
 902        me_fft=mpi_enregs(idtset)%me_fft
 903      end if
 904    end if
 905 
 906 !  Compute mgfft,mpw,nfft for this data set (it is dependent of mpi_enreg)
 907    ABI_MALLOC(istwfk,(nkpt))
 908    ABI_MALLOC(kpt_with_shift,(3,nkpt))
 909 
 910    ! Set the default value of fftalg for given npfft but allow the user to override it.
 911    ! Warning: If you need to change npfft, **DO IT** before this point so that here we get the correct fftalg
 912    dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
 913    dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
 914 
 915    fftalg_read=.false.
 916    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'fftalg',tread0,'INT')
 917 
 918    if (tread0==1) then
 919      dtsets(idtset)%ngfft(7)=intarr(1)
 920      if (usepaw==1) dtsets(idtset)%ngfftdg(7)=intarr(1)
 921      fftalg_read=.true.
 922    end if
 923 
 924    ecut     =dtsets(idtset)%ecut
 925    dilatmx  =dtsets(idtset)%dilatmx
 926    ngfft(:) =dtsets(idtset)%ngfft(:)
 927    istwfk(:)=dtsets(idtset)%istwfk(1:nkpt)
 928    nsym     =dtsets(idtset)%nsym
 929 
 930    nqpt=dtsets(idtset)%nqpt
 931    qphon(:)=zero;if(nqpt/=0) qphon(:)=dtsets(idtset)%qptn(:)
 932 
 933    ABI_MALLOC(symrel,(3,3,nsym))
 934    symrel(:,:,1:nsym)=dtsets(idtset)%symrel(:,:,1:nsym)
 935    ecut_eff=ecut*dilatmx**2
 936 
 937    if (usepaw==1) call wrtout(std_out,'getng is called for the coarse grid:')
 938    kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary
 939 
 940    call getng(dtsets(idtset)%boxcutmin,dtsets(idtset)%chksymtnons,ecut_eff,gmet,kpt,me_fft,mgfft,nfft,&
 941 &   ngfft,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,&
 942 &   gpu_option=dtsets(idtset)%gpu_option)
 943 
 944    dtsets(idtset)%ngfft(:)=ngfft(:)
 945    dtsets(idtset)%mgfft=mgfft
 946    dtsets(idtset)%nfft=nfft
 947    kpt_with_shift(:,:)=dtsets(idtset)%kpt(:,1:nkpt)/dtsets(idtset)%kptnrm
 948 
 949    exchn2n3d=dtsets(idtset)%exchn2n3d
 950    nproc_fft=ngfft(10) ; me_fft=ngfft(11)
 951    fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
 952 
 953    ! Initialize tables for MPI-FFT.
 954    call init_distribfft(mpi_enregs(idtset)%distribfft,'c',mpi_enregs(idtset)%nproc_fft,ngfft(2),ngfft(3))
 955 
 956    if(response/=0)then
 957 !    This value of mpw is used in the first part of respfn.f
 958      call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw_k,nkpt)
 959    end if
 960    if(nqpt/=0)then
 961      kpt_with_shift(1,:)=kpt_with_shift(1,:)+qphon(1)
 962      kpt_with_shift(2,:)=kpt_with_shift(2,:)+qphon(2)
 963      kpt_with_shift(3,:)=kpt_with_shift(3,:)+qphon(3)
 964    end if
 965    if (dtsets(idtset)%usewvl == 0) then
 966      call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw,nkpt)
 967      ! Allocate tables for parallel IO of the wavefunctions.
 968      if( xmpi_mpiio==1 .and. mpi_enregs(idtset)%paral_kgb == 1 .and. &
 969 &     any(dtsets(idtset)%iomode == [IO_MODE_MPI, IO_MODE_ETSF])) then
 970        ABI_MALLOC(mpi_enregs(idtset)%my_kgtab,(mpw,dtsets(idtset)%mkmem))
 971      end if
 972    else
 973      mpw = 0
 974    end if
 975 
 976 !  The dimensioning, in the RF case, should be done only with mpw,
 977 !  but mpw is used in the first part of respfn.f, and should at least
 978 !  be equal to mpw_k . The chosen way to code is not optimal, only convenient :
 979 !  it leads to a small waste of memory.
 980    if(response/=0 .and. mpw_k>mpw)mpw=mpw_k
 981    dtsets(idtset)%ngfft(:)=ngfft(:)
 982 
 983 !  Initialize ngfftc to the initial guess for the coarse mesh
 984    ngfftc(:) = 2
 985 
 986 !  In case of PAW, compute fine FFT parameters
 987    if (usepaw==1) then
 988      ecutdg_eff=dtsets(idtset)%pawecutdg*dtsets(idtset)%dilatmx**2
 989      ngfftdg(:)=dtsets(idtset)%ngfftdg(:)
 990      call wrtout(std_out,'getng is called for the fine grid:')
 991 !    Start with the coarse mesh as an initial guess for the fine mesh
 992 !    This ensures that the fine mesh will not be any coarser than the coarse mesh in each dimension
 993      ngfftc(:) = ngfft(1:3)
 994      kpt=k0; if (response==1.and.usepaw==1) kpt=qphon  ! this is temporary
 995 
 996      call getng(dtsets(idtset)%bxctmindg,dtsets(idtset)%chksymtnons,&
 997 &     ecutdg_eff,gmet,kpt,me_fft,mgfftdg,&
 998 &     nfftdg,ngfftdg,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,ngfftc,&
 999 &     gpu_option=dtsets(idtset)%gpu_option)
1000 
1001      dtsets(idtset)%ngfftdg(:)=ngfftdg(:)
1002      dtsets(idtset)%mgfftdg=mgfftdg
1003      dtsets(idtset)%nfftdg=nfftdg
1004 !    Compute fft distribution for fine grid
1005      fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
1006      call init_distribfft(mpi_enregs(idtset)%distribfft,'f', mpi_enregs(idtset)%nproc_fft,ngfftdg(2),ngfftdg(3))
1007    end if
1008 
1009    dtsets(idtset)%mpw=mpw
1010    ABI_FREE(symrel)
1011    ABI_FREE(istwfk)
1012    ABI_FREE(kpt_with_shift)
1013    ABI_FREE(intarr)
1014    ABI_FREE(dprarr)
1015 
1016 !  Initialize data for the parallelization over atomic sites (PAW)
1017    if (dtsets(idtset)%natom==1) dtsets(idtset)%paral_atom=0
1018    if (dtsets(idtset)%usepaw==0) dtsets(idtset)%paral_atom=0
1019    if (dtsets(idtset)%usewvl/=0) dtsets(idtset)%paral_atom=0
1020    if (dtsets(idtset)%usedmft==1) dtsets(idtset)%paral_atom=0
1021    if (optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_RESPFN.and.optdriver/=RUNL_GWLS) dtsets(idtset)%paral_atom=0
1022    if (dtsets(idtset)%macro_uj/=0) dtsets(idtset)%paral_atom=0
1023 
1024    call initmpi_atom(dtsets(idtset),mpi_enregs(idtset))
1025 
1026 !  In case of the use of a GPU (Cuda), some defaults can change
1027 !  according to a threshold on matrix sizes
1028    if (dtsets(idtset)%gpu_option/=ABI_GPU_DISABLED.or.dtsets(idtset)%gpu_option==ABI_GPU_UNKNOWN) then
1029      if (optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then
1030        vectsize=dtsets(idtset)%mpw*dtsets(idtset)%nspinor/dtsets(idtset)%npspinor
1031        if (all(dtsets(idtset)%istwfk(:)==2)) vectsize=2*vectsize
1032        blocksize=dtsets(idtset)%npband*dtsets(idtset)%bandpp
1033        if (dtsets(idtset)%paral_kgb==0) blocksize=dtsets(idtset)%npfft
1034        if ((vectsize*blocksize**2)>=dtsets(idtset)%gpu_linalg_limit) then
1035          if (.not.wfoptalg_read) then
1036            dtsets(idtset)%wfoptalg=14
1037            if (tread(8)==1) then ! bandpp read, so set nblock_lobpcg
1038              dtsets(idtset)%nblock_lobpcg=mband_upper/(dtsets(idtset)%bandpp*dtsets(idtset)%npband)
1039            else ! bandpp set by nblock_lobpcg
1040              dtsets(idtset)%bandpp=mband_upper/(dtsets(idtset)%nblock_lobpcg*dtsets(idtset)%npband)
1041            end if
1042            if (.not.fftalg_read) then
1043              dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
1044              if (usepaw==1) dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
1045            end if
1046            if (.not.ortalg_read) dtsets(idtset)%ortalg=-abs(dtsets(idtset)%ortalg)
1047          end if
1048        end if
1049      end if
1050    end if
1051 
1052 !  initialize data for the parallelization for WVL:
1053    if(dtsets(idtset)%usewvl==1) then
1054      mpi_enregs(idtset)%comm_wvl=mpi_enregs(idtset)%comm_cell
1055      mpi_enregs(idtset)%nproc_wvl=xmpi_comm_size(mpi_enregs(idtset)%comm_wvl)
1056      mpi_enregs(idtset)%me_wvl=xmpi_comm_rank(mpi_enregs(idtset)%comm_wvl)
1057    end if
1058 
1059  end do
1060 
1061 !This is not a very clean exit in case of paral_kgb<0
1062  if (iexit/=0)then
1063    ABI_STOP("Stopping now!")
1064  end if
1065 
1066  DBG_EXIT("COLL")
1067 
1068 end subroutine mpi_setup