TABLE OF CONTENTS


ABINIT/fftprof [ Programs ]

[ Top ] [ Programs ]

NAME

 fftprof

FUNCTION

  Utility for profiling the FFT libraries supported by ABINIT.

COPYRIGHT

 Copyright (C) 2004-2018 ABINIT group (MG)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

INPUTS

  (main program)

OUTPUT

  Timing analysis of the different FFT libraries and algorithms.

NOTES

  Point-group symmetries are not taken into account in getng during the generation
  of the FFT mesh. Therefore the FFT mesh might differ from the one
  found by abinit for the same cutoff and Bravais lattice (actually it might be smaller).

 Description of the input file (Fortran NAMELIST):

   &CONTROL
     tasks = string specifying the tasks to perform i.e. the routines that should be tested or profiled.
             allowed values:
                 fourdp --> Test FFT transforms of density and potentials on the full box.
                 fourwf --> Test FFT transforms of wavefunctions using the zero-padded algorithm.
                 gw_fft --> Test the FFT transforms used in the GW code.
                 all    --> Test all FFT routines (DEFAULT)
     fftalgs = list of fftalg values (used to select the FFT libraries to use, see abinit doc for more info)
     ncalls = integer defining the number of calls for each tests. The final Wall time and CPU time
              are computed by averaging the final results over ncalls executions.
     max_nthreads = Maximum number of threads (DEFAULT 1, meaningful only if the code
                    uses threaded external libraries or OpenMP parallelization)
     ndat   = integer specifying how many FFT transforms should be executed for each call to the FFT routine
              (same meaning as the ndat input variable passed to fourwf)
     necut  = Used if tasks = "bench". Specifies the number of cutoff energies to profile (see also ecut_arth)
     ecut_arth = Used if tasks = "bench". Used to generate an arithmetic progression of cutoff energies
                 whose starting value is ecut_arth(1) and whose step is ecut_arth(2)

  &SYSTEM
     ecut = cutoff energy for wavefunctions (real, Hartree units)
     rprimd = Direct lattice vectors in Bohr. (3,3) matrix in Fortran column-major order
     kpoint = real(3) vector specifying the reduced coordinates of the k-point of the wavefunction (used if tasks = "fourwf").
               The value of the k-point defines the storage scheme (istwfk) of the u(G) coefficients and therefore
               the FFT algorithm used to perform the transform u(G) <--> u(R) in fourwf.
     osc_ecut  = cutoff energy (Hartree) for the oscillator matrix elements computed in the GW code
                 Corresponds to the input variables (ecuteps, ecutsigx) used in the main code.
     nsym     =Number of symmetry operations (DEFAULT 1)
     symrel(3,3,nsym) = Symmetry operation in real space used to select the FFT mesh in the routine getng (default: Identity matrix)

PARENTS

CHILDREN

      abi_io_redirect,abimem_init,abinit_doctor,destroy_mpi_enreg
      fft_test_free,fft_test_init,fft_test_nullify,fft_test_print
      fft_use_lib_threads,fftprof_free,fftprof_ncalls_per_test,fftprof_print
      fftw3_init_threads,flush_unit,get_kg,getng,herald,initmpi_seq,lower
      metric,prof_fourdp,prof_fourwf,prof_rhotwg,time_fftbox,time_fftu
      time_fourdp,time_fourwf,time_rhotwg,wrtout,xmpi_bcast,xmpi_init
      xomp_show_info

SOURCE

 70 #if defined HAVE_CONFIG_H
 71 #include "config.h"
 72 #endif
 73 
 74 #include "abi_common.h"
 75 
 76 program fftprof
 77 
 78  use defs_basis
 79  use defs_abitypes
 80  use m_build_info
 81  use m_xmpi
 82  use m_xomp
 83  use m_errors
 84  use m_FFT_prof
 85  use m_abicore
 86  use m_dfti
 87 
 88  use m_fstrings,   only : lower
 89  use m_specialmsg, only : specialmsg_getcount, herald
 90  use m_io_tools,   only : flush_unit
 91  use m_geometry,   only : metric
 92  use m_fftcore,    only : get_cache_kb, get_kg, fftalg_isavailable, fftalg_has_mpi, getng
 93  use m_fft,        only : fft_use_lib_threads, fftbox_utests, fftu_utests, fftbox_mpi_utests, fftu_mpi_utests
 94  use m_fftw3,      only : fftw3_init_threads
 95  use m_mpinfo,     only : destroy_mpi_enreg, initmpi_seq
 96 
 97 !This section has been created automatically by the script Abilint (TD).
 98 !Do not modify the following lines by hand.
 99 #undef ABI_FUNC
100 #define ABI_FUNC 'fftprof'
101 !End of the abilint section
102 
103  implicit none
104 
105 
106 !Arguments -----------------------------------
107 !Local variables-------------------------------
108 !scalars
109  integer,parameter :: MAX_NFFTALGS=50,MAX_NSYM=48
110  integer,parameter :: paral_kgb0=0,me_fft0=0,nproc_fft1=1,master=0
111  integer :: ii,fftcache,it,cplex,ntests,option_fourwf,osc_npw
112  integer :: map2sphere,use_padfft,isign,nthreads,comm,nprocs,my_rank
113  integer :: iset,iall,inplace,nsets,avail,ith,idx,ut_nfft,ut_mgfft
114  integer :: nfftalgs,alg,fftalg,fftalga,fftalgc,nfailed,ierr,paral_kgb
115  character(len=24) :: codename
116  character(len=500) :: header,msg
117  real(dp),parameter :: boxcutmin2=two
118  real(dp) :: ucvol
119  logical :: test_fourdp,test_fourwf,test_gw,do_seq_bench,do_seq_utests
120  logical :: iam_master,do_mpi_utests !,do_mpi_bench,
121  type(MPI_type) :: MPI_enreg
122 !arrays
123  integer :: ut_ngfft(18)
124  real(dp),parameter :: gamma_point(3)=(/zero,zero,zero/)
125  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3)
126  type(FFT_test_t),allocatable :: Ftest(:)
127  type(FFT_prof_t),allocatable :: Ftprof(:)
128  integer,allocatable :: osc_gvec(:,:)
129  integer,allocatable :: fft_setups(:,:),fourwf_params(:,:)
130 ! ==========  INPUT FILE ==============
131  integer :: ncalls=10,max_nthreads=1,ndat=1,necut=0,nsym=1
132  character(len=500) :: tasks="all"
133  integer :: fftalgs(MAX_NFFTALGS) = 0
134  integer :: symrel(3,3,MAX_NSYM) = 0
135  real(dp),parameter :: k0(3)=(/zero,zero,zero/)
136  real(dp) :: ecut=30,osc_ecut=3
137  real(dp) :: ecut_arth(2)=zero
138  real(dp) :: rprimd(3,3)
139  real(dp) :: kpoint(3) = (/0.1,0.2,0.3/)
140  logical :: use_lib_threads = .FALSE.
141  namelist /CONTROL/ tasks, ncalls, max_nthreads, ndat, fftalgs, necut, ecut_arth, use_lib_threads
142  namelist /SYSTEM/ ecut, rprimd, kpoint, osc_ecut, nsym,symrel
143 
144 ! *************************************************************************
145 
146  ! Change communicator for I/O (mandatory!)
147  call abi_io_redirect(new_io_comm=xmpi_world)
148 
149  call xmpi_init()
150  comm = xmpi_world
151  my_rank = xmpi_comm_rank(comm); nprocs = xmpi_comm_size(comm)
152  iam_master = (my_rank == master)
153 
154 !Initialize memory profiling if it is activated
155 !if a full abimem.mocc report is desired, set the argument of abimem_init to "2" instead of "0"
156 !note that abimem.mocc files can easily be multiple GB in size so don't use this option normally
157 #ifdef HAVE_MEM_PROFILING
158  call abimem_init(0)
159 #endif
160 
161  if (iam_master) then
162    ! Print header and read input file.
163    codename='FFTPROF'//REPEAT(' ',17)
164    call herald(codename,abinit_version,std_out)
165 
166    write(std_out,'(a)')" Tool for profiling and testing the FFT libraries used in ABINIT."
167    write(std_out,'(a)')" Allowed options are: "
168    write(std_out,'(a)')"   fourdp --> Test FFT transforms of density and potentials on the full box."
169    write(std_out,'(a)')"   fourwf --> Test FFT transforms of wavefunctions using the zero-pad algorithm."
170    write(std_out,'(a)')"   gw_fft --> Test the FFT transforms used in the GW code."
171    write(std_out,'(a)')"   all    --> Test all FFT routines."
172    !write(std_out,'(a)')"   seq-utests --> Units tests for sequential FFTs."
173    !write(std_out,'(a)')"   mpi-utests --> Units tests for MPI FFTs."
174    !write(std_out,'(a)')"   seq-bench  --> Benchmark mode for sequential FFTs."
175    !write(std_out,'(a)')"   mpi-bench  --> Benchmark mode for MPI FFTs."
176    write(std_out,'(a)')" "
177 
178    !Read input file
179    read(std_in, NML=CONTROL)
180    read(std_in, NML=SYSTEM)
181    !write(std_out, NML=CONTROL)
182    !write(std_out, NML=SYSTEM)
183    call lower(tasks)
184  end if
185 
186  ! Broadcast variables.
187  if (nprocs > 1) then
188    ! CONTROL
189    call xmpi_bcast(tasks,master,comm,ierr)
190    call xmpi_bcast(ncalls,master,comm,ierr)
191    call xmpi_bcast(max_nthreads,master,comm,ierr)
192    call xmpi_bcast(ndat,master,comm,ierr)
193    call xmpi_bcast(fftalgs,master,comm,ierr)
194    call xmpi_bcast(necut,master,comm,ierr)
195    ! SYSTEM
196    call xmpi_bcast(ecut_arth,master,comm,ierr)
197    call xmpi_bcast(use_lib_threads,master,comm,ierr)
198    call xmpi_bcast(ecut,master,comm,ierr)
199    call xmpi_bcast(rprimd,master,comm,ierr)
200    call xmpi_bcast(kpoint,master,comm,ierr)
201    call xmpi_bcast(osc_ecut,master,comm,ierr)
202    call xmpi_bcast(nsym,master,comm,ierr)
203    call xmpi_bcast(symrel,master,comm,ierr)
204  end if
205 
206  ! replace "+" with white spaces
207  do ii=1,LEN_TRIM(tasks)
208    if (tasks(ii:ii) == "+") tasks(ii:ii) = " "
209  end do
210  !%call str_replace_chars(tasks,"+"," ")
211  tasks = " "//TRIM(tasks)//""
212 
213  iall=INDEX (tasks,"all")
214  test_fourdp = (iall>0 .or. INDEX(tasks," fourdp")>0 )
215  test_fourwf = (iall>0 .or. INDEX(tasks," fourwf")>0 )
216  test_gw     = (iall>0 .or. INDEX(tasks," gw_fft")>0 )
217 
218  do_seq_utests = INDEX(tasks," utests")>0
219  do_seq_bench  = INDEX(tasks," bench")>0
220 
221  do_mpi_utests = INDEX(tasks," mpi-utests")>0
222  !do_mpi_bench  = INDEX(tasks," mpi-bench")>0
223 
224 #ifdef HAVE_FFT_FFTW3_THREADS
225  call fftw3_init_threads()
226 #endif
227 
228 #ifndef HAVE_OPENMP
229  ABI_CHECK(max_nthreads<=1,"nthreads>1 but OMP support is not enabled!")
230 #endif
231  if (max_nthreads>1.and. iam_master) then
232    call xomp_show_info(std_out)
233  end if
234 
235  call fft_use_lib_threads(use_lib_threads)
236  !write(std_out,*)"use_lib_threads: ",use_lib_threads
237 
238  if (do_mpi_utests) then
239    ! Execute unit tests for MPI FFTs and terminate execution.
240    call wrtout(std_out,"=== MPI FFT Unit Tests ===","COLL")
241 
242    ! List the FFT libraries that will be tested.
243    ! Goedecker FFTs are always available, other libs are optional.
244    nfftalgs = COUNT(fftalgs/=0)
245    ABI_CHECK(nfftalgs > 0,"fftalgs must be specified")
246 
247    nfailed = 0; nthreads = 0
248    do ii=1,nfftalgs
249      fftalg = fftalgs(ii)
250      do paral_kgb=1,1
251        write(msg,"(5(a,i0))")&
252 &       "MPI fftu_utests with fftalg = ",fftalg,", paral_kgb = ",paral_kgb," ndat = ",ndat,", nthreads = ",nthreads
253        call wrtout(std_out,msg,"COLL")
254        nfailed = nfailed + fftu_mpi_utests(fftalg,ecut,rprimd,ndat,nthreads,comm,paral_kgb)
255      end do
256    end do
257    ! TEMPORARY
258    !goto 1
259 
260    nfailed = 0; nthreads = 0
261    do ii=1,nfftalgs
262      fftalg = fftalgs(ii)
263      do cplex=1,2
264        write(msg,"(4(a,i0))")"MPI fftbox_utests with fftalg = ",fftalg,", cplex = ",cplex," ndat = ",ndat,", nthreads = ",nthreads
265        call wrtout(std_out,msg,"COLL")
266        nfailed = nfailed + fftbox_mpi_utests(fftalg=fftalg,cplex=cplex,ndat=ndat,nthreads=nthreads,comm_fft=comm)
267      end do
268    end do
269 
270 !1  continue
271    write(msg,'(a,i0)')"Total number of failed tests = ",nfailed
272    call wrtout(std_out,msg,"COLL")
273 
274    goto 100 ! Jump to xmpi_end
275  end if
276 
277  call initmpi_seq(MPI_enreg)
278 
279  call metric(gmet,gprimd,std_out,rmet,rprimd,ucvol)
280 
281  ! symmetries (if given) are used for defining the FFT mesh.
282  if (nsym==1 .and. ALL(symrel==0)) then
283    symrel(:,:,1) = RESHAPE((/1,0,0,0,1,0,0,0,1/),(/3,3/))
284  end if
285 
286  ! Set the number of calls for test.
287  ncalls=ABS(ncalls); if (ncalls==0) ncalls=10
288  call fftprof_ncalls_per_test(ncalls)
289 
290  ! List the FFT libraries that will be tested.
291  ! Goedecker FFTs are always available, other libs are optional.
292  ! write(std_out,*)"ndat = ",ndat
293  nfftalgs = COUNT(fftalgs/=0)
294  ABI_CHECK(nfftalgs > 0,"fftalgs must be specified")
295 
296  ntests = max_nthreads * nfftalgs
297 
298  ! First dimension contains (fftalg,fftcache,ndat,nthreads,available).
299  ABI_MALLOC(fft_setups,(5,ntests))
300 
301  ! Default Goedecker library.
302  idx=0
303  do alg=1,nfftalgs
304    fftalg = fftalgs(alg)
305    fftalga=fftalg/100; fftalgc=mod(fftalg,10)
306    avail = 1
307    if (.not.fftalg_isavailable(fftalg)) avail = 0
308    do ith=1,max_nthreads
309      !fftcache is machine-dependent.
310      fftcache = get_cache_kb()
311      idx = idx + 1
312      fft_setups(:,idx) = (/fftalg,fftcache,ndat,ith,avail/)
313    end do
314  end do
315 
316  ! Compute FFT box.
317  ABI_DT_MALLOC(Ftest,(ntests))
318  ABI_DT_MALLOC(Ftprof,(ntests))
319 
320  do it=1,ntests ! Needed for crappy compilers that do not support => null() in type declarations.
321    call fft_test_nullify(Ftest(it))
322  end do
323 
324  do it=1,ntests
325    call fft_test_init(Ftest(it),fft_setups(:,it),kpoint,ecut,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
326     ! Ftest%results is allocated using nfftot and the consistency btw libs is tested assuming an equal number of FFT-points.
327    if ( ANY(Ftest(it)%ngfft(1:3) /= Ftest(1)%ngfft(1:3)) ) then
328      MSG_ERROR("Consistency check assumes equal FFT meshes. Cannot continue")
329    end if
330    if (it == 1) then
331      call fft_test_print(Ftest(it)) !,header)
332    else if (fft_setups(1,it) /= fft_setups(1,it-1)) then
333      call fft_test_print(Ftest(it)) !,header)
334    end if
335  end do
336  !
337  ! =======================
338  ! ==== fourdp timing ====
339  ! =======================
340  if (test_fourdp) then
341    do isign=-1,1,2
342      do cplex=1,2
343        do it=1,ntests
344          call time_fourdp(Ftest(it),isign,cplex,header,Ftprof(it))
345        end do
346        call fftprof_print(Ftprof,header)
347        call fftprof_free(Ftprof)
348      end do
349    end do
350  end if
351  !
352  ! =======================
353  ! ==== fourwf timing ====
354  ! =======================
355  if (test_fourwf) then
356    ! possible combinations of (option, cplex) supported in fourwf.
357    ! (cplex=2 only allowed for option=2, and istwf_k=1)
358    nsets=4; if (Ftest(1)%istwf_k==1) nsets=5
359    ABI_MALLOC(fourwf_params,(2,nsets))
360    fourwf_params(:,1) = (/0,0/)
361    fourwf_params(:,2) = (/1,1/)
362    fourwf_params(:,3) = (/2,1/)
363    fourwf_params(:,4) = (/3,0/)
364    if (nsets==5) fourwf_params(:,5) = (/2,2/)
365 
366    do iset=1,nsets
367      option_fourwf = fourwf_params(1,iset)
368      cplex         = fourwf_params(2,iset)
369      do it=1,ntests
370        call time_fourwf(Ftest(it),cplex,option_fourwf,header,Ftprof(it))
371      end do
372      call fftprof_print(Ftprof,header)
373      call fftprof_free(Ftprof)
374    end do
375    ABI_FREE(fourwf_params)
376  end if
377  !
378  ! ==========================
379  ! ==== Test GW routines ====
380  ! ==========================
381  ! These routines are used in the GW part, FFTW3 is expected to
382  ! be more efficient as the conversion complex(:) <--> real(2,:) is not needed.
383  if (test_gw) then
384    !
385    ! ==== fourdp timing with complex arrays ====
386    do isign=-1,1,2
387      do inplace=0,1
388        do it=1,ntests
389          call time_fftbox(Ftest(it),isign,inplace,header,Ftprof(it))
390        end do
391        call fftprof_print(Ftprof,header)
392        call fftprof_free(Ftprof)
393      end do
394    end do
395    !
396    ! ==== zero padded with complex arrays ====
397    do isign=-1,1,2
398      do it=1,ntests
399        call time_fftu(Ftest(it),isign,header,Ftprof(it))
400      end do
401      call fftprof_print(Ftprof,header)
402      call fftprof_free(Ftprof)
403    end do
404    !
405    ! ==== rho_tw_g timing ====
406    ABI_CHECK(osc_ecut > zero,"osc_ecut <= zero!")
407 
408    call get_kg(gamma_point,1,osc_ecut,gmet,osc_npw,osc_gvec)
409    ! TODO should reorder by shells to be consistent with the GW part!
410    ! Moreover I guess this ordering is more efficient when we have
411    ! to map the box to the G-sphere!
412    map2sphere=1; !map2sphere=0
413 
414    do use_padfft=0,1
415      do it=1,ntests
416        call time_rhotwg(Ftest(it),map2sphere,use_padfft,osc_npw,osc_gvec,header,Ftprof(it))
417      end do
418      call fftprof_print(Ftprof,header)
419      call fftprof_free(Ftprof)
420    end do
421 
422    ABI_FREE(osc_gvec)
423  end if ! test_gw
424 
425  if (do_seq_utests) then
426    call wrtout(std_out,"=== FFT Unit Tests ===","COLL")
427 
428    nfailed = 0
429    do idx=1,ntests
430      ! fft_setups(:,idx) = (/fftalg,fftcache,ndat,ith,avail/)
431      fftalg   = fft_setups(1,idx)
432      fftcache = fft_setups(2,idx)
433      ndat     = fft_setups(3,idx)
434      nthreads = fft_setups(4,idx)
435      ! Skip the test if library is not available.
436      if (fft_setups(5,idx) == 0) CYCLE
437 
438      write(msg,"(3(a,i0))")"fftbox_utests with fftalg = ",fftalg,", ndat = ",ndat,", nthreads = ",nthreads
439      call wrtout(std_out,msg,"COLL")
440 
441      nfailed = nfailed + fftbox_utests(fftalg,ndat,nthreads)
442 
443      ! Warning: This routine is not thread safe hence we have to initialize ngfft it here.
444      ut_ngfft = -1
445      ut_ngfft(7) = fftalg
446      ut_ngfft(8) = fftcache
447 
448      call getng(boxcutmin2,ecut,gmet,k0,me_fft0,ut_mgfft,ut_nfft,ut_ngfft,nproc_fft1,nsym,&
449 &     paral_kgb0,symrel,unit=dev_null)
450 
451      write(msg,"(3(a,i0))")"fftu_utests with fftalg = ",fftalg,", ndat = ",ndat,", nthreads = ",nthreads
452      call wrtout(std_out,msg,"COLL")
453 
454      nfailed = nfailed + fftu_utests(ecut,ut_ngfft,rprimd,ndat,nthreads)
455    end do
456 
457    write(msg,'(a,i0)')"Total number of failed tests = ",nfailed
458    call wrtout(std_out,msg,"COLL")
459  end if
460 
461 ! Benchmarks for the sequential version.
462  if (do_seq_bench) then
463    call wrtout(std_out,"Entering benchmark mode","COLL")
464    write(std_out,*)"ecut_arth",ecut_arth,", necut ",necut
465 
466    if (INDEX(tasks,"bench_fourdp")>0) then
467      isign=1; cplex=1
468      call prof_fourdp(fft_setups,isign,cplex,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
469 
470      isign=1; cplex=2
471      call prof_fourdp(fft_setups,isign,cplex,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
472    end if
473 
474    if (INDEX(tasks,"bench_fourwf")>0) then
475      cplex=2; option_fourwf=0
476      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
477 
478      cplex=1; option_fourwf=1
479      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
480 
481      cplex=1; option_fourwf=2
482      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
483 
484      cplex=2; option_fourwf=2
485      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
486 
487      cplex=0; option_fourwf=3
488      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
489    end if
490 
491    if (INDEX(tasks,"bench_rhotwg")>0) then
492      map2sphere=1; use_padfft=1
493      call prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin2,&
494 &     rprimd,nsym,symrel,gmet,MPI_enreg)
495 
496      map2sphere=1; use_padfft=0
497      call prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin2,&
498 &     rprimd,nsym,symrel,gmet,MPI_enreg)
499    end if
500  end if
501  !
502  !===============================
503  !=== End of run, free memory ===
504  !===============================
505  call wrtout(std_out,ch10//" Analysis completed.","COLL")
506 
507  ABI_FREE(fft_setups)
508 
509  call fft_test_free(Ftest)
510  ABI_DT_FREE(Ftest)
511  call fftprof_free(Ftprof)
512  ABI_DT_FREE(Ftprof)
513 
514  call flush_unit(std_out)
515 
516  call destroy_mpi_enreg(MPI_enreg)
517 
518  call abinit_doctor("__fftprof")
519 
520  100 call xmpi_end()
521 
522  end program fftprof