TABLE OF CONTENTS
- ABINIT/m_FFT_prof
- m_fft_mesh/FFT_prof_t
- m_fft_mesh/FFT_test_t
- m_FFT_prof/empty_cache
- m_FFT_prof/fft_test_free_0D
- m_FFT_prof/fft_test_free_1D
- m_FFT_prof/fft_test_init
- m_FFT_prof/fft_test_print
- m_FFT_prof/fftprof_free_0D
- m_FFT_prof/fftprof_free_1D
- m_FFT_prof/fftprof_init
- m_FFT_prof/fftprof_ncalls_per_test
- m_FFT_prof/fftprofs_print
- m_FFT_prof/get_name
- m_FFT_prof/prof_fourdp
- m_FFT_prof/prof_fourwf
- m_FFT_prof/prof_rhotwg
- m_FFT_prof/time_fftbox
- m_FFT_prof/time_fftu
- m_FFT_prof/time_fourdp
- m_FFT_prof/time_fourwf
- m_FFT_prof/time_rhotwg
ABINIT/m_FFT_prof [ Modules ]
NAME
m_FFT_prof
COPYRIGHT
Copyright (C) 2008-2022 ABINIT group (MG) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
13 #if defined HAVE_CONFIG_H 14 #include "config.h" 15 #endif 16 17 #include "abi_common.h" 18 19 MODULE m_FFT_prof 20 21 use defs_basis 22 use m_xomp 23 use m_errors 24 use m_abicore 25 use m_fftw3 26 use m_fft 27 use m_distribfft 28 29 use defs_abitypes, only : MPI_type 30 use m_fstrings, only : sjoin, itoa 31 use m_numeric_tools, only : arth 32 use m_time, only : cwtime 33 use m_io_tools, only : open_file 34 use m_geometry, only : metric 35 use m_hide_blas, only : xcopy 36 use m_cgtools, only : set_istwfk 37 use m_fftcore, only : get_kg, print_ngfft, fftalg_info, kgindex, getng, sphereboundary 38 use m_fft_mesh, only : calc_eigr, calc_ceigr 39 use m_mpinfo, only : nullify_mpi_enreg, destroy_mpi_enreg, copy_mpi_enreg, initmpi_seq 40 use m_oscillators, only : rho_tw_g 41 42 implicit none 43 44 private 45 46 integer,private,parameter :: TNAME_LEN=100
m_fft_mesh/FFT_prof_t [ Types ]
[ Top ] [ m_fft_mesh ] [ Types ]
NAME
FFT_prof_t
FUNCTION
The results of the tests
SOURCE
128 type,public :: FFT_prof_t 129 integer :: ncalls 130 integer :: ndat 131 integer :: use_gpu 132 integer :: nthreads 133 real(dp) :: cpu_time 134 real(dp) :: wall_time 135 real(dp) :: gflops 136 character(len=TNAME_LEN) :: test_name 137 complex(dpc),allocatable :: results(:) 138 139 contains 140 procedure :: init => fftprof_init 141 procedure :: free => fftprof_free_0D 142 end type FFT_prof_t 143 144 public :: fftprofs_free 145 public :: fftprofs_print 146 147 interface fftprofs_free 148 module procedure fftprof_free_0D 149 module procedure fftprof_free_1D 150 end interface fftprofs_free
m_fft_mesh/FFT_test_t [ Types ]
[ Top ] [ m_fft_mesh ] [ Types ]
NAME
FFT_test_t
FUNCTION
Parameters passed to the FFT routines used in abinit (fourdp|fourwf).
SOURCE
69 type, public :: FFT_test_t 70 71 integer :: available = 0 72 integer :: istwf_k = -1 73 integer :: mgfft = -1 74 integer :: ndat = -1 75 integer :: nfft = -1 76 integer :: nthreads = 1 77 integer :: npw_k = -1 78 integer :: npw_kout = -1 79 integer :: paral_kgb = -1 80 integer :: use_gpu = 0 81 82 real(dp) :: ecut=zero 83 integer :: ngfft(18)=-1 84 85 real(dp) :: kpoint(3) = [zero,zero,zero] 86 real(dp) :: rprimd(3,3),rmet(3,3) 87 real(dp) :: gprimd(3,3),gmet(3,3) 88 89 integer,allocatable :: kg_k(:,:) 90 integer,allocatable :: kg_kout(:,:) 91 integer,allocatable :: indpw_k(:) 92 93 type(MPI_type) :: MPI_enreg 94 95 contains 96 procedure :: init => fft_test_init 97 procedure :: free => fft_test_free_0D 98 procedure :: print => fft_test_print 99 procedure :: get_name 100 101 ! Timing routines. 102 procedure :: time_fourdp 103 procedure :: time_fftbox 104 procedure :: time_fourwf 105 procedure :: time_rhotwg 106 procedure :: time_fftu 107 end type FFT_test_t 108 109 public :: fft_tests_free 110 111 interface fft_tests_free 112 module procedure fft_test_free_0D 113 module procedure fft_test_free_1D 114 end interface fft_tests_free
m_FFT_prof/empty_cache [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
empty_cache
FUNCTION
Empty the memory cache
INPUTS
OUTPUT
SOURCE
1731 integer function empty_cache(kbsize) result(fake) 1732 1733 !Arguments ----------------------------------- 1734 integer,intent(in) :: kbsize 1735 1736 !Local variables------------------------------- 1737 integer :: sz 1738 real(dp),allocatable :: chunk(:) 1739 1740 ! ********************************************************************* 1741 1742 fake = 0 1743 if (kbsize <= 0) RETURN 1744 1745 sz = int((100._dp * kbsize) / dp) 1746 1747 ABI_MALLOC(chunk,(sz)) 1748 call random_number(chunk) 1749 fake = int(SUM(chunk)) ! Need a result, otherwise smart compilers may skip the call. 1750 ABI_FREE(chunk) 1751 1752 end function empty_cache
m_FFT_prof/fft_test_free_0D [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fft_test_free_0D
FUNCTION
Destruction method for the FFT_test_t structured datatype.
INPUTS
OUTPUT
SOURCE
269 subroutine fft_test_free_0D(Ftest) 270 271 !Arguments ----------------------------------- 272 class(FFT_test_t),intent(inout) :: Ftest 273 274 ! ********************************************************************* 275 276 ABI_SFREE(Ftest%indpw_k) 277 ABI_SFREE(Ftest%kg_k) 278 ABI_SFREE(Ftest%kg_kout) 279 280 call destroy_mpi_enreg(Ftest%MPI_enreg) 281 282 end subroutine fft_test_free_0D
m_FFT_prof/fft_test_free_1D [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fft_test_free_1D
FUNCTION
Destruction method for the FFT_test_t structured datatype.
INPUTS
OUTPUT
SOURCE
300 subroutine fft_test_free_1D(Ftest) 301 302 !Arguments ----------------------------------- 303 class(FFT_test_t),intent(inout) :: Ftest(:) 304 305 !Local variables------------------------------- 306 integer :: ii 307 308 ! ********************************************************************* 309 310 do ii=LBOUND(Ftest,DIM=1),UBOUND(Ftest,DIM=1) 311 call Ftest(ii)%free() 312 end do 313 314 end subroutine fft_test_free_1D
m_FFT_prof/fft_test_init [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fft_test_init
FUNCTION
Creation method for the FFT_test_t structured datatype.
INPUTS
OUTPUT
SOURCE
178 subroutine fft_test_init(Ftest, fft_setup, kpoint, ecut, boxcutmin, rprimd, nsym, symrel, MPI_enreg_in) 179 180 !Arguments ----------------------------------- 181 !scalars 182 class(FFT_test_t),intent(inout) :: Ftest 183 integer,intent(in) :: nsym 184 real(dp),intent(in) :: ecut,boxcutmin 185 class(MPI_type),intent(in) :: MPI_enreg_in 186 !arrays 187 integer,intent(in) :: fft_setup(6),symrel(3,3,nsym) 188 real(dp),intent(in) :: kpoint(3),rprimd(3,3) 189 190 !Local variables------------------------------- 191 !scalars 192 integer :: fftalg,fftcache,ndat 193 real(dp) :: ucvol 194 !arrays 195 logical,allocatable :: mask(:) 196 real(dp),parameter :: k0(3)=zero 197 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3) 198 real(dp),allocatable :: tnons(:,:) 199 200 ! ************************************************************************* 201 202 call nullify_mpi_enreg(Ftest%MPI_enreg) 203 204 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 205 206 Ftest%rprimd = rprimd 207 Ftest%rmet = rmet 208 Ftest%gprimd = gprimd 209 Ftest%gmet = gmet 210 Ftest%ecut = ecut 211 212 fftalg = fft_setup(1) 213 fftcache = fft_setup(2) 214 ndat = fft_setup(3) 215 Ftest%nthreads = fft_setup(4) 216 Ftest%available = fft_setup(5) 217 Ftest%use_gpu = fft_setup(6) 218 219 Ftest%paral_kgb = 0 220 Ftest%kpoint = kpoint 221 Ftest%ndat = ndat 222 223 Ftest%istwf_k = set_istwfk(kpoint) 224 225 call get_kg(Ftest%kpoint,Ftest%istwf_k,ecut,gmet,Ftest%npw_k,Ftest%kg_k) 226 call get_kg(Ftest%kpoint,Ftest%istwf_k,ecut,gmet,Ftest%npw_kout,Ftest%kg_kout) 227 228 call copy_mpi_enreg(MPI_enreg_in,Ftest%MPI_enreg) 229 230 Ftest%ngfft(7) = fftalg 231 Ftest%ngfft(8) = fftcache 232 233 ! Fill part of ngfft 234 ABI_CALLOC(tnons,(3,nsym)) 235 236 call getng(boxcutmin,0,ecut,gmet,k0,Ftest%MPI_enreg%me_fft,Ftest%mgfft,Ftest%nfft,Ftest%ngfft,Ftest%MPI_enreg%nproc_fft,nsym,& 237 & Ftest%MPI_enreg%paral_kgb,symrel,tnons, unit=dev_null, use_gpu_cuda=ftest%use_gpu) 238 239 ABI_FREE(tnons) 240 241 call init_distribfft(Ftest%MPI_enreg%distribfft,'c',Ftest%MPI_enreg%nproc_fft,Ftest%ngfft(2),Ftest%ngfft(3)) 242 243 ! Compute the index of each plane wave in the FFT grid. 244 ABI_MALLOC(Ftest%indpw_k,(Ftest%npw_k)) 245 246 ABI_MALLOC(mask,(Ftest%npw_k)) 247 call kgindex(Ftest%indpw_k,Ftest%kg_k,mask,Ftest%MPI_enreg,Ftest%ngfft,Ftest%npw_k) 248 ABI_CHECK(ALL(mask),"FFT parallelism not supported in fftprof") 249 ABI_FREE(mask) 250 251 end subroutine fft_test_init
m_FFT_prof/fft_test_print [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fft_test_print
FUNCTION
Printout of the FFT_test_t structured datatype.
INPUTS
OUTPUT
SOURCE
332 subroutine fft_test_print(Ftest, header, unit, mode_paral, prtvol) 333 334 !Arguments ----------------------------------- 335 class(FFT_test_t),intent(in) :: Ftest 336 integer,optional,intent(in) :: unit,prtvol 337 character(len=4),optional,intent(in) :: mode_paral 338 character(len=*),optional,intent(in) :: header 339 340 !Local variables------------------------------- 341 !scalars 342 integer :: my_unt,my_prtvol 343 character(len=4) :: my_mode 344 character(len=500) :: msg 345 ! ********************************************************************* 346 347 my_unt =std_out; if (PRESENT(unit )) my_unt =unit 348 my_prtvol=0 ; if (PRESENT(prtvol )) my_prtvol=prtvol 349 my_mode ='COLL' ; if (PRESENT(mode_paral)) my_mode =mode_paral 350 351 !msg=' ==== Info on the FFT test object ==== ' 352 if (PRESENT(header)) then 353 msg=' ==== '//TRIM(ADJUSTL(header))//' ==== ' 354 call wrtout(my_unt,msg,my_mode) 355 end if 356 357 write(msg,'(a,i3)')"FFT setup for fftalg ",Ftest%ngfft(7) 358 call print_ngfft(Ftest%ngfft, header=msg, unit=my_unt) 359 360 end subroutine fft_test_print
m_FFT_prof/fftprof_free_0D [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fftprof_free_0D
FUNCTION
Destruction method for the FFT_prof_t structured datatype.
SOURCE
456 subroutine fftprof_free_0D(Ftprof) 457 458 !Arguments ----------------------------------- 459 class(FFT_prof_t),intent(inout) :: Ftprof 460 461 ! ********************************************************************* 462 463 ABI_SFREE(Ftprof%results) 464 465 end subroutine fftprof_free_0D
m_FFT_prof/fftprof_free_1D [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fftprof_free_1D
FUNCTION
Destruction method for the FFT_prof_t structured datatype.
INPUTS
OUTPUT
SOURCE
483 subroutine fftprof_free_1D(Ftprof) 484 485 !Arguments ----------------------------------- 486 class(FFT_prof_t),intent(inout) :: Ftprof(:) 487 488 !Local variables------------------------------- 489 integer :: ii 490 ! ********************************************************************* 491 492 do ii=LBOUND(Ftprof,DIM=1),UBOUND(Ftprof,DIM=1) 493 call fftprof_free_0D(Ftprof(ii)) 494 end do 495 496 end subroutine fftprof_free_1D
m_FFT_prof/fftprof_init [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fftprof_init
FUNCTION
Creation method for the FFT_prof_t structured datatype.
INPUTS
gflops = Gigaflops
OUTPUT
SOURCE
416 subroutine fftprof_init(Ftprof, test_name, nthreads, ncalls, ndat, use_gpu, cpu_time, wall_time, gflops, results) 417 418 !Arguments ----------------------------------- 419 class(FFT_prof_t),intent(out) :: Ftprof 420 integer,intent(in) :: ncalls,nthreads,ndat, use_gpu 421 real(dp),intent(in) :: cpu_time,wall_time,gflops 422 character(len=*),intent(in) :: test_name 423 !arrays 424 complex(dpc),optional,intent(in) :: results(:) 425 426 ! ************************************************************************* 427 428 Ftprof%ncalls = ncalls 429 Ftprof%nthreads = nthreads 430 Ftprof%ndat = ndat 431 Ftprof%use_gpu = use_gpu 432 Ftprof%cpu_time = cpu_time 433 Ftprof%wall_time = wall_time 434 Ftprof%gflops = gflops 435 Ftprof%test_name = test_name 436 437 if (present(results)) then 438 ABI_REMALLOC(Ftprof%results, (size(results))) 439 Ftprof%results = results 440 end if 441 442 end subroutine fftprof_init
m_FFT_prof/fftprof_ncalls_per_test [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fftprof_ncalls_per_test
FUNCTION
Helper function used to set the number of calls to be used in each time_* routine.
INPUTS
ncalls=Number of calls to be used.
SIDE EFFECTS
NCALLS_FOR_TEST = ncalls
SOURCE
1127 subroutine fftprof_ncalls_per_test(ncalls) 1128 1129 !Arguments ----------------------------------- 1130 integer,intent(in) :: ncalls 1131 ! ********************************************************************* 1132 1133 NCALLS_FOR_TEST = ncalls 1134 1135 end subroutine fftprof_ncalls_per_test
m_FFT_prof/fftprofs_print [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
fftprofs_print
FUNCTION
Printout of the FFT_prof_t structured datatype.
INPUTS
OUTPUT
SOURCE
514 subroutine fftprofs_print(Fprof, header, unit, mode_paral, prtvol) 515 516 !Arguments ----------------------------------- 517 class(FFT_prof_t),intent(in) :: Fprof(:) 518 integer,optional,intent(in) :: unit,prtvol 519 character(len=4),optional,intent(in) :: mode_paral 520 character(len=*),optional,intent(in) :: header 521 522 !Local variables------------------------------- 523 !scalars 524 integer :: my_unt,my_prtvol,ncalls, field1_w,ii,ref_lib !ifft 525 real(dp) :: mabs_err,mean_err,check_mabs_err,check_mean_err, ref_wtime,para_eff 526 character(len=4) :: my_mode 527 character(len=500) :: ofmt,hfmt,nafmt,msg 528 529 ! ********************************************************************* 530 531 my_unt =std_out; if (PRESENT(unit )) my_unt =unit 532 my_prtvol=0 ; if (PRESENT(prtvol )) my_prtvol=prtvol 533 my_mode ='COLL' ; if (PRESENT(mode_paral)) my_mode =mode_paral 534 535 msg='==== Info on the FFT_prof_t object ====' 536 if (PRESENT(header)) msg='==== '//TRIM(ADJUSTL(header))//' ====' 537 538 call wrtout(my_unt,ch10//REPEAT("=",LEN_TRIM(msg))) 539 call wrtout(my_unt,msg,my_mode) 540 call wrtout(my_unt,REPEAT("=",LEN_TRIM(msg))) 541 542 field1_w=0 ! Width of the field used to print key names. 543 do ii=1,SIZE(Fprof) 544 field1_w = MAX(field1_w, LEN_TRIM(Fprof(ii)%test_name)) 545 end do 546 547 if (field1_w==0) RETURN 548 field1_w = field1_w + 2 ! To account for ". " 549 write(ofmt,*)"(a",field1_w,",2x,2(f7.4,4x),1x,i2,1x,a,i3,a,1x,i0,4x,2(es9.2,3x))" 550 551 write(hfmt,*)"(a",field1_w,",2x,a)" 552 write(std_out,hfmt)" Library ","CPU-time WALL-time nthreads ncalls Max_|Err| <|Err|>" 553 ! 554 ! Find reference library. 555 ref_lib=0 556 do ii=1,SIZE(Fprof) 557 if (Fprof(ii)%ncalls>0) then 558 ref_lib = ii 559 EXIT 560 end if 561 end do 562 !ref_lib=3 563 ! 564 ! Write timing analysis and error wrt reference library if available. 565 check_mabs_err=zero; check_mean_err=zero 566 do ii=1,SIZE(Fprof) 567 ncalls = Fprof(ii)%ncalls 568 if (ncalls>0) then 569 mabs_err = zero; mean_err=zero 570 if (ref_lib>0) then 571 mabs_err = MAXVAL( ABS(Fprof(ii)%results - Fprof(ref_lib)%results) ) 572 mean_err = SUM( ABS(Fprof(ii)%results - Fprof(ref_lib)%results) ) / SIZE(Fprof(ref_lib)%results) 573 ! Relative error is not a good estimator because some components are close to zero within machine accuracy. 574 !mean_err = 100 * MAXVAL( ABS(Fprof(ii)%results - Fprof(1)%results)/ ABS(Fprof(1)%results )) 575 !ifft = imax_loc( ABS(Fprof(ii)%results - Fprof(1)%results)/ ABS(Fprof(1)%results) ) 576 !write(std_out,*) Fprof(ii)%results(ifft),Fprof(1)%results(ifft) 577 end if 578 if (Fprof(ii)%nthreads==1) ref_wtime = Fprof(ii)%wall_time 579 para_eff = 100 * ref_wtime / ( Fprof(ii)%wall_time * Fprof(ii)%nthreads) 580 write(std_out,ofmt)& 581 "- "//Fprof(ii)%test_name,Fprof(ii)%cpu_time/ncalls,Fprof(ii)%wall_time/ncalls,& 582 Fprof(ii)%nthreads,"(",NINT(para_eff),"%)",ncalls,mabs_err,mean_err 583 check_mabs_err = MAX(check_mabs_err, mabs_err) 584 check_mean_err = MAX(check_mean_err, mean_err) 585 else 586 write(nafmt,*)"(a",field1_w,",2x,a)" 587 write(std_out,nafmt)"- "//Fprof(ii)%test_name," N/A N/A N/A N/A N/A N/A" 588 end if 589 end do 590 591 if (ref_lib>0) then 592 write(std_out,'(/,2(a,es9.2),2a)')& 593 " Consistency check: MAX(Max_|Err|) = ",check_mabs_err,& 594 ", Max(<|Err|>) = ",check_mean_err,", reference_lib: ",TRIM(Fprof(ref_lib)%test_name) 595 end if 596 write(std_out,*) 597 598 end subroutine fftprofs_print
m_FFT_prof/get_name [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
get_name
FUNCTION
Returns a string with info on the test.
INPUTS
OUTPUT
SOURCE
378 character(len=TNAME_LEN) function get_name(Ftest) 379 380 !Arguments ----------------------------------- 381 class(FFT_test_t),intent(in) :: Ftest 382 383 !Local variables------------------------------- 384 character(len=TNAME_LEN) :: library_name,cplex_mode,padding_mode 385 386 ! ********************************************************************* 387 388 if (ftest%use_gpu == 0) then 389 call fftalg_info(Ftest%ngfft(7), library_name, cplex_mode, padding_mode) 390 !get_name = TRIM(library_name)//"; "//TRIM(cplex_mode)//"; "//TRIM(padding_mode) 391 write(get_name,'(i3)')Ftest%ngfft(7) 392 get_name = TRIM(library_name)//" ("//TRIM(get_name)//")" 393 else 394 get_name = "GPU" 395 end if 396 397 end function get_name
m_FFT_prof/prof_fourdp [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
prof_fourdp
FUNCTION
Profile fourdp
INPUTS
OUTPUT
SOURCE
1437 subroutine prof_fourdp(fft_setups, isign, cplex, necut, ecut_arth, boxcutmin, rprimd, nsym, symrel, MPI_enreg_in) 1438 1439 !Arguments ----------------------------------- 1440 !scalars 1441 integer,intent(in) :: nsym,isign,cplex,necut 1442 real(dp),intent(in) :: boxcutmin 1443 type(MPI_type),intent(in) :: MPI_enreg_in 1444 !arrays 1445 integer,intent(in) :: fft_setups(:,:),symrel(3,3,nsym) 1446 real(dp),intent(in) :: ecut_arth(2) 1447 real(dp),intent(in) :: rprimd(3,3) 1448 1449 !Local variables------------------------------- 1450 !scalars 1451 integer :: iec,nsetups,set,funt 1452 type(FFT_test_t) :: Ftest 1453 type(FFT_prof_t) :: Ftprof 1454 character(len=500) :: msg,frm,header 1455 character(len=fnlen) :: fname 1456 !arrays 1457 integer :: ngfft_ecut(18,necut) 1458 real(dp),parameter :: k_gamma(3)=zero 1459 real(dp) :: ecut_list(necut) 1460 real(dp),allocatable :: prof_res(:,:,:) 1461 1462 ! ********************************************************************* 1463 1464 nsetups = size(fft_setups, dim=2) 1465 ecut_list = arth(ecut_arth(1), ecut_arth(2), necut) 1466 1467 ! Open file and write header with info. 1468 write(fname,'(2(a,i1))')"PROF_fourdp_cplex",cplex,"_isign",isign 1469 if (open_file(fname, msg, newunit=funt) /= 0) then 1470 ABI_ERROR(msg) 1471 end if 1472 1473 write(msg,'(2(a,i0))')"Benchmark: routine = fourdp, cplex =",cplex,", isign=",isign 1474 write(std_out,'(a)')" Running "//TRIM(msg) 1475 1476 write(funt,'(a)')"# "//TRIM(msg) 1477 do set=1,nsetups 1478 write(funt,'(a, 6(a,i0))') "#",& 1479 " fftalg = " ,fft_setups(1,set), & 1480 ", fftcache = " ,fft_setups(2,set), & 1481 ", ndat = " ,fft_setups(3,set), & 1482 ", nthreads = " ,fft_setups(4,set), & 1483 ", available = ",fft_setups(5,set), & 1484 ", use_gpu = ",fft_setups(6,set) 1485 end do 1486 1487 ABI_MALLOC(prof_res,(2,necut,nsetups)) 1488 1489 do set=1,nsetups 1490 do iec=1,necut 1491 call Ftest%init(fft_setups(:,set),k_gamma,ecut_list(iec),boxcutmin,rprimd,nsym,symrel,MPI_enreg_in) 1492 call Ftest%time_fourdp(isign, cplex, header, Ftprof) 1493 1494 prof_res(1,iec,set) = Ftprof%cpu_time /Ftprof%ncalls 1495 prof_res(2,iec,set) = Ftprof%wall_time/Ftprof%ncalls 1496 1497 ! Save FFT divisions. 1498 if (set == 1) ngfft_ecut(:,iec) = Ftest%ngfft 1499 1500 call Ftprof%free() 1501 call Ftest%free() 1502 end do 1503 end do 1504 1505 ! Write the wall-time as a function of ecut. 1506 write(frm,*)"(f7.1,3i4,",nsetups,"(f7.4))" 1507 do iec=1,necut 1508 write(funt,frm) ecut_list(iec),ngfft_ecut(4:6,iec),prof_res(2,iec,:) 1509 end do 1510 1511 close(funt) 1512 ABI_FREE(prof_res) 1513 1514 end subroutine prof_fourdp
m_FFT_prof/prof_fourwf [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
prof_fourwf
FUNCTION
profile fourwf
INPUTS
OUTPUT
SOURCE
1532 subroutine prof_fourwf(fft_setups, cplex, option, kpoint, necut, ecut_arth, & 1533 boxcutmin, rprimd, nsym, symrel, MPI_enreg_in) 1534 1535 !Arguments ----------------------------------- 1536 !scalars 1537 integer,intent(in) :: nsym,cplex,necut,option 1538 real(dp),intent(in) :: boxcutmin 1539 type(MPI_type),intent(in) :: MPI_enreg_in 1540 !arrays 1541 integer,intent(in) :: fft_setups(:,:),symrel(3,3,nsym) 1542 real(dp),intent(in) :: ecut_arth(2) 1543 real(dp),intent(in) :: kpoint(3),rprimd(3,3) 1544 1545 !Local variables------------------------------- 1546 !scalars 1547 integer :: iec,nsetups,set,funt,istwf_k 1548 type(FFT_test_t) :: Ftest 1549 type(FFT_prof_t) :: Ftprof 1550 character(len=500) :: msg,frm,header 1551 character(len=fnlen) :: fname 1552 !arrays 1553 integer :: ngfft_ecut(18,necut) 1554 real(dp) :: ecut_list(necut) 1555 real(dp),allocatable :: prof_res(:,:,:) 1556 1557 ! ********************************************************************* 1558 1559 nsetups = size(fft_setups, dim=2) 1560 ecut_list = arth(ecut_arth(1), ecut_arth(2), necut) 1561 istwf_k = set_istwfk(kpoint) 1562 1563 ! Open file and write header with info. 1564 write(fname,'(3(a,i1))')"PROF_fourwf_cplex",cplex,"_option",option,"_istwfk",istwf_k 1565 if (open_file(fname, msg, newunit=funt) /= 0) then 1566 ABI_ERROR(msg) 1567 end if 1568 1569 write(msg,'(3(a,i1))')"Benchmark: routine = fourwf, cplex = ",cplex,", option= ",option,", istwfk= ",istwf_k 1570 write(std_out,'(a)')" Running "//TRIM(msg) 1571 1572 write(funt,'(a)')"# "//TRIM(msg) 1573 do set=1,nsetups 1574 write(funt,'(a,6(a,i0))') "#",& 1575 " fftalg = " ,fft_setups(1,set), & 1576 ", fftcache = " ,fft_setups(2,set), & 1577 ", ndat = " ,fft_setups(3,set), & 1578 ", nthreads = " ,fft_setups(4,set), & 1579 ", available = ",fft_setups(5,set), & 1580 ", use_gpu = ",fft_setups(6,set) 1581 end do 1582 1583 ABI_MALLOC(prof_res,(2,necut,nsetups)) 1584 1585 do set=1,nsetups 1586 do iec=1,necut 1587 call Ftest%init(fft_setups(:,set),kpoint,ecut_list(iec),boxcutmin,rprimd,nsym,symrel,MPI_enreg_in) 1588 call Ftest%time_fourwf(cplex, option, header, Ftprof) 1589 1590 prof_res(1,iec,set) = Ftprof%cpu_time /Ftprof%ncalls 1591 prof_res(2,iec,set) = Ftprof%wall_time/Ftprof%ncalls 1592 1593 ! Save FFT divisions. 1594 if (set == 1) ngfft_ecut(:,iec) = Ftest%ngfft 1595 call Ftprof%free() 1596 call Ftest%free() 1597 end do 1598 end do 1599 1600 ! Write the wall-time as a function of ecut. 1601 write(frm,*)"(f7.1,3i4,",nsetups,"(f7.4))" 1602 do iec=1,necut 1603 write(funt,frm) ecut_list(iec),ngfft_ecut(4:6,iec),prof_res(2,iec,:) 1604 end do 1605 1606 close(funt) 1607 ABI_FREE(prof_res) 1608 1609 end subroutine prof_fourwf
m_FFT_prof/prof_rhotwg [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
prof_rhotwg
FUNCTION
profile rhotwg
INPUTS
OUTPUT
SOURCE
1627 subroutine prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin,& 1628 rprimd,nsym,symrel,gmet,MPI_enreg_in) 1629 1630 !Arguments ----------------------------------- 1631 !scalars 1632 integer,intent(in) :: nsym,necut,map2sphere,use_padfft 1633 real(dp),intent(in) :: boxcutmin,osc_ecut 1634 type(MPI_type),intent(in) :: MPI_enreg_in 1635 !arrays 1636 integer,intent(in) :: fft_setups(:,:),symrel(3,3,nsym) 1637 real(dp),intent(in) :: ecut_arth(2) 1638 real(dp),intent(in) :: rprimd(3,3),gmet(3,3) 1639 1640 !Local variables------------------------------- 1641 !scalars 1642 integer :: iec,nsetups,set,funt,osc_npw 1643 type(FFT_test_t) :: Ftest 1644 type(FFT_prof_t) :: Ftprof 1645 character(len=500) :: msg,frm,header 1646 character(len=fnlen) :: fname 1647 !arrays 1648 integer,allocatable :: osc_gvec(:,:) 1649 integer :: ngfft_ecut(18,necut) 1650 real(dp),parameter :: k_gamma(3)=zero 1651 real(dp) :: ecut_list(necut) 1652 real(dp),allocatable :: prof_res(:,:,:) 1653 1654 ! ********************************************************************* 1655 1656 nsetups = size(fft_setups, dim=2) 1657 ecut_list = arth(ecut_arth(1), ecut_arth(2), necut) 1658 1659 ! Open file and write header with info. 1660 write(fname,'(2(a,i1))')"PROF_rhotwg_map2sphere",map2sphere,"_use_padfft",use_padfft 1661 if (open_file(fname, msg, newunit=funt) /= 0) then 1662 ABI_ERROR(msg) 1663 end if 1664 1665 write(msg,'(2(a,i0),a,f5.1)') & 1666 "Benchmark: routine = rho_tw_g, map2sphere = ",map2sphere,", use_padfft = ",use_padfft,", osc_ecut = ",osc_ecut 1667 write(std_out,'(a)')" Running "//TRIM(msg) 1668 1669 write(funt,'(a)')"# "//TRIM(msg) 1670 do set=1,nsetups 1671 write(funt,'(a,6(a,i0))') "#",& 1672 " fftalg = " ,fft_setups(1,set),& 1673 ", fftcache = " ,fft_setups(2,set),& 1674 ", ndat = " ,fft_setups(3,set),& 1675 ", nthreads = " ,fft_setups(4,set),& 1676 ", available = ",fft_setups(5,set),& 1677 ", use_gpu = ",fft_setups(6,set) 1678 end do 1679 1680 call get_kg([zero,zero,zero],1,osc_ecut,gmet,osc_npw,osc_gvec) 1681 ! TODO should reorder by shells to be consistent with the GW part! 1682 ! Moreover I guess this ordering is more efficient when we have 1683 ! to map the box to the G-sphere! 1684 1685 ABI_MALLOC(prof_res,(2,necut,nsetups)) 1686 1687 do set=1,nsetups 1688 do iec=1,necut 1689 call Ftest%init(fft_setups(:,set),k_gamma,ecut_list(iec),boxcutmin,rprimd,nsym,symrel,MPI_enreg_in) 1690 1691 call Ftest%time_rhotwg(map2sphere,use_padfft,osc_npw,osc_gvec,header,Ftprof) 1692 1693 prof_res(1,iec,set) = Ftprof%cpu_time /Ftprof%ncalls 1694 prof_res(2,iec,set) = Ftprof%wall_time/Ftprof%ncalls 1695 ! Save FFT divisions. 1696 if (set==1) ngfft_ecut(:,iec) = Ftest%ngfft 1697 1698 call Ftprof%free() 1699 call Ftest%free() 1700 end do 1701 end do 1702 1703 ! Write the wall-time as a function of ecut. 1704 write(frm,*)"(f7.1,3i4,",nsetups,"(f7.4))" 1705 do iec=1,necut 1706 write(funt,frm) ecut_list(iec),ngfft_ecut(4:6,iec),prof_res(2,iec,:) 1707 end do 1708 1709 close(funt) 1710 ABI_FREE(prof_res) 1711 ABI_FREE(osc_gvec) 1712 1713 end subroutine prof_rhotwg
m_FFT_prof/time_fftbox [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
time_fftbox
FUNCTION
Profiling of the fftbox_[io]p routines.
INPUTS
OUTPUT
SOURCE
736 subroutine time_fftbox(Ftest, isign, inplace, header, Ftprof) 737 738 !Arguments ----------------------------------- 739 class(FFT_test_t),intent(inout) :: Ftest 740 integer,intent(in) :: isign,inplace 741 character(len=500),intent(inout) :: header 742 class(FFT_prof_t),intent(out) :: Ftprof 743 744 !Local variables------------------------------- 745 !scalars 746 integer,parameter :: nspinor1 = 1, fftcache0 = 0 747 integer :: icall,i1,i2,i3,n1,n2,n3,ifft,ndat,nfft,dat,padat 748 real(dp) :: cpu_time,wall_time,gflops,gsq 749 !character(len=500) :: msg 750 character(len=TNAME_LEN) :: test_name 751 type(fftbox_plan3_t) :: plan 752 !arrays 753 integer,parameter :: g0(3) = [1,-2,1] 754 integer :: gg(3) 755 complex(dpc),allocatable :: ffc(:),ggc(:),results(:) 756 ! ********************************************************************* 757 758 test_name = Ftest%get_name() 759 760 if (Ftest%available == 0) then 761 call Ftprof%init(test_name, 0, 0, 0, 0, zero, zero, zero) 762 return 763 end if 764 765 call xomp_set_num_threads(Ftest%nthreads) 766 767 if (Ftest%ngfft(7)/100 == FFT_FFTW3) call fftw3_set_nthreads(Ftest%nthreads) 768 769 n1 = Ftest%ngfft(1); n2 = Ftest%ngfft(2); n3 = Ftest%ngfft(3) 770 nfft = Ftest%nfft; ndat = Ftest%ndat 771 write(header,'(3(a,i2))')" fftbox with isign ",isign,", in-place ",inplace,", ndat ",ndat 772 773 ABI_CALLOC(ffc,(nfft*ndat)) 774 ABI_CALLOC(ggc,(nfft*ndat)) 775 ABI_CALLOC(results, (nfft*ndat)) 776 777 if (isign==-1) then 778 call calc_ceigr(g0,nfft,nspinor1,Ftest%ngfft,ffc) 779 else if (isign==1) then 780 ifft=0 781 do i3=1,n3 782 gg(3)=i3-1; if (i3>1+n3/2) gg(3)=i3-n3-1 ! TODO recheck this 783 do i2=1,n2 784 gg(2)=i2-1; if (i2>1+n2/2) gg(2)=i2-n2-1 785 do i1=1,n1 786 gg(1)=i1-1; if (i1>1+n1/2) gg(1)=i1-n1-1 787 gsq = two_pi**2 * DOT_PRODUCT(gg,MATMUL(Ftest%gmet,gg)) 788 ifft=ifft+1 789 ffc(ifft) = EXP(-gsq) 790 end do 791 end do 792 end do 793 else 794 ABI_ERROR("Wrong isign") 795 end if 796 797 ! Replicate and scale input data 798 do dat=2,ndat 799 padat = (dat-1) * nfft 800 do ifft=1,nfft 801 ffc(ifft+padat) = DBLE(dat) * ffc(ifft) 802 end do 803 end do 804 805 call cwtime(cpu_time, wall_time, gflops, "start") 806 807 ! No augmentation here. 808 call plan%init(ndat, Ftest%ngfft(1:3), Ftest%ngfft(1:3), Ftest%ngfft(7), fftcache0, ftest%use_gpu) 809 810 select case (inplace) 811 case (0) 812 do icall=1,NCALLS_FOR_TEST 813 ifft = empty_cache(CACHE_KBSIZE) 814 call plan%execute(ffc, ggc, isign) 815 ! Store results at the first call. 816 if (icall == 1) results = ggc 817 end do 818 case (1) 819 do icall=1,NCALLS_FOR_TEST 820 ifft = empty_cache(CACHE_KBSIZE) 821 call plan%execute(ffc, isign) 822 ! Store results at the first call. 823 if (icall == 1) results = ffc 824 end do 825 case default 826 ABI_ERROR(sjoin("Wrong value for inplace:", itoa(inplace))) 827 end select 828 829 call cwtime(cpu_time, wall_time, gflops, "stop") 830 call Ftprof%init(test_name,Ftest%nthreads,NCALLS_FOR_TEST,Ftest%ndat,ftest%use_gpu, & 831 cpu_time,wall_time,gflops,results=results) 832 833 call plan%free() 834 ABI_FREE(ffc) 835 ABI_FREE(ggc) 836 ABI_FREE(results) 837 838 end subroutine time_fftbox
m_FFT_prof/time_fftu [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
time_fftu
FUNCTION
Profiling of the fftu routines.
INPUTS
OUTPUT
SOURCE
1286 subroutine time_fftu(Ftest, isign, header, Ftprof) 1287 1288 !Arguments ----------------------------------- 1289 class(FFT_test_t),intent(inout) :: Ftest 1290 integer,intent(in) :: isign 1291 character(len=500),intent(out) :: header 1292 class(FFT_prof_t),intent(out) :: Ftprof 1293 1294 !Local variables------------------------------- 1295 !scalars 1296 integer,parameter :: nspinor1=1 1297 integer :: icall,i1,i2,i3,n1,n2,n3,ifft,npw_k,ndat,dat,nfft,cnt,ipw,padat,istwf_k 1298 !integer :: n4,n5,n6,n456,idx 1299 real(dp) :: cpu_time,wall_time,gflops,gsq,g0dotr 1300 logical :: not_implemented 1301 !character(len=500) :: msg 1302 character(len=TNAME_LEN) :: test_name 1303 !arrays 1304 integer,parameter :: g0(3) = [1,-2,1] 1305 integer :: gg(3) 1306 integer,allocatable :: kg_k(:,:),gbound(:,:) 1307 complex(dpc),allocatable :: ug(:),results(:),ur(:) 1308 ! ********************************************************************* 1309 1310 test_name = Ftest%get_name() 1311 1312 n1=Ftest%ngfft(1); n2=Ftest%ngfft(2); n3=Ftest%ngfft(3) 1313 !n4=Ftest%ngfft(4); n5=Ftest%ngfft(5); n6=Ftest%ngfft(6) 1314 1315 nfft = n1*n2*n3 1316 !n456 = n4*n5*n6 1317 ndat = Ftest%ndat 1318 1319 write(header,'(2(a,i2))')" fftu with isign ",isign,", ndat ",ndat 1320 1321 ! TODO: zero-pad not available with SG2001 routines. 1322 not_implemented = (Ftest%ngfft(7) == 412) 1323 1324 if (Ftest%available==0.or.not_implemented) then 1325 call Ftprof%init(test_name,0,0,0,0,zero,zero,zero) 1326 return 1327 end if 1328 1329 call xomp_set_num_threads(Ftest%nthreads) 1330 1331 if (Ftest%ngfft(7)/100 == FFT_FFTW3) call fftw3_set_nthreads(Ftest%nthreads) 1332 ! 1333 istwf_k = Ftest%istwf_k 1334 !istwf_k = 1 ! Do not use istwf_k trick here 1335 call get_kg(Ftest%kpoint,istwf_k,Ftest%ecut,Ftest%gmet,npw_k,kg_k) 1336 1337 ABI_MALLOC(gbound,(2*Ftest%mgfft+8,2)) 1338 call sphereboundary(gbound,istwf_k,kg_k,Ftest%mgfft,npw_k) 1339 1340 ABI_MALLOC(ug,(npw_k*ndat)) 1341 ABI_MALLOC(ur, (nfft*ndat)) 1342 ABI_MALLOC(results,(nfft*ndat)) 1343 results=czero ! needed for R --> G since npw_k < nfft 1344 1345 if (isign==1) then 1346 do cnt=0,(npw_k * ndat) - 1 1347 ipw = 1 + MOD(cnt, npw_k) 1348 gg = kg_k(:,ipw) 1349 gsq = two_pi**2 * DOT_PRODUCT(gg,MATMUL(Ftest%gmet,gg)) 1350 ug(cnt+1) = DCMPLX(EXP(-gsq), zero) 1351 end do 1352 ! 1353 ! Replicate input data 1354 do dat=2,ndat 1355 padat = (dat-1) * npw_k 1356 do ipw=1,npw_k 1357 ug(ipw+padat) = DBLE(dat) * ug(ipw) 1358 end do 1359 end do 1360 1361 else if (isign==-1) then 1362 1363 do i3=0,n3-1 1364 do i2=0,n2-1 1365 do i1=0,n1-1 1366 g0dotr= two_pi*( g0(1)*(i1/DBLE(n1)) & 1367 & +g0(2)*(i2/DBLE(n2)) & 1368 & +g0(3)*(i3/DBLE(n3)) ) 1369 ifft = 1 + i1 + i2*n1 + i3*n2*n3 1370 ur(ifft)=DCMPLX(DCOS(g0dotr),DSIN(g0dotr)) 1371 end do 1372 end do 1373 end do 1374 ! 1375 ! Replicate input data 1376 do dat=2,ndat 1377 padat = (dat-1)*nfft 1378 do ifft=1,nfft 1379 ur(ifft+padat) = DBLE(dat) * ur(ifft) 1380 end do 1381 end do 1382 1383 else 1384 ABI_ERROR(sjoin("Wrong isign:", itoa(isign))) 1385 end if 1386 1387 call cwtime(cpu_time, wall_time, gflops, "start") 1388 1389 do icall=1,NCALLS_FOR_TEST 1390 ifft = empty_cache(CACHE_KBSIZE) 1391 1392 !call fftpad(ug,Ftest%ngfft,n1,n2,n3,n4,n5,n6,ndat,Ftest%mgfft,isign,gbound) 1393 if (isign == +1) then 1394 call fft_ug(npw_k,nfft,nspinor1,ndat,Ftest%mgfft,Ftest%ngfft,istwf_k,kg_k,gbound,ug,ur) 1395 ! Store results at the first call. 1396 if (icall == 1) call xcopy(nfft*ndat,ur,1,results,1) 1397 1398 else 1399 call fft_ur(npw_k,nfft,nspinor1,ndat,Ftest%mgfft,Ftest%ngfft,istwf_k,kg_k,gbound,ur,ug) 1400 ! Store results at the first call. 1401 if (icall==1) call xcopy(npw_k*ndat,ug,1,results,1) 1402 end if 1403 !if (isign==-1) then 1404 ! write(Ftest%ngfft(7),*)"ngfft, isign ",Ftest%ngfft(7),isign 1405 ! do idx =1,n4*n5*n6*ndat; write(Ftest%ngfft(7),*)idx,results(idx); end do 1406 !end if 1407 end do 1408 1409 call cwtime(cpu_time, wall_time, gflops, "stop") 1410 call Ftprof%init(test_name,Ftest%nthreads,NCALLS_FOR_TEST,ndat,ftest%use_gpu, & 1411 cpu_time,wall_time,gflops,results=results) 1412 1413 ABI_FREE(kg_k) 1414 ABI_FREE(gbound) 1415 ABI_FREE(ug) 1416 ABI_FREE(ur) 1417 ABI_FREE(results) 1418 1419 end subroutine time_fftu
m_FFT_prof/time_fourdp [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
time_fourdp
FUNCTION
Profiling of the fourdp routine.
INPUTS
OUTPUT
SOURCE
616 subroutine time_fourdp(Ftest, isign, cplex, header, Ftprof) 617 618 !Arguments ----------------------------------- 619 class(FFT_test_t),intent(inout) :: Ftest 620 integer,intent(in) :: cplex,isign 621 character(len=500),intent(out) :: header 622 class(FFT_prof_t),intent(out) :: Ftprof 623 624 !Local variables------------------------------- 625 !scalars 626 integer,parameter :: nspinor1=1 627 integer :: icall,i1,i2,i3,n1,n2,n3,ifft 628 real(dp) :: cpu_time,wall_time,gflops,gsq 629 !character(len=500) :: msg 630 character(len=TNAME_LEN) :: test_name 631 !arrays 632 integer,parameter :: g0(3)=(/1,2,-1/) 633 integer :: gg(3) 634 real(dp),allocatable :: fofg(:,:),fofr(:) 635 complex(dpc),allocatable :: results(:),ctmp(:) 636 637 ! ********************************************************************* 638 639 test_name = Ftest%get_name() 640 n1=Ftest%ngfft(1); n2=Ftest%ngfft(2); n3=Ftest%ngfft(3) 641 642 write(header,'(2(a,i2),a)')" fourdp with cplex ",cplex,", isign ",isign,", ndat 1" 643 644 if (Ftest%available==0) then 645 call Ftprof%init(test_name,0,0,0,0,zero,zero,zero) 646 RETURN 647 end if 648 649 call xomp_set_num_threads(Ftest%nthreads) 650 651 if (Ftest%ngfft(7)/100 == FFT_FFTW3) call fftw3_set_nthreads(Ftest%nthreads) 652 653 ABI_MALLOC(fofg,(2,Ftest%nfft)) 654 ABI_MALLOC(fofr,(cplex*Ftest%nfft)) 655 656 ! Initialize input data. 657 if (isign==1) then 658 ! initialize fofg 659 fofg = zero 660 ifft=0 661 do i3=1,n3 662 gg(3)=i3-1; if (i3>1+n3/2) gg(3)=i3-n3-1 ! TODO recheck this 663 do i2=1,n2 664 gg(2)=i2-1; if (i2>1+n2/2) gg(2)=i2-n2-1 665 do i1=1,n1 666 gg(1)=i1-1; if (i1>1+n1/2) gg(1)=i1-n1-1 667 gsq = two_pi**2 * DOT_PRODUCT(gg,MATMUL(Ftest%gmet,gg)) 668 ifft=ifft+1 669 fofg(1,ifft) = EXP(-gsq) 670 fofg(2,ifft) = zero 671 end do 672 end do 673 end do 674 675 else 676 ! init fofr 677 if (cplex==2) then 678 call calc_eigr(g0,Ftest%nfft,Ftest%ngfft,fofr) 679 else if (cplex==1) then 680 ABI_MALLOC(ctmp,(Ftest%nfft)) 681 call calc_ceigr(g0,Ftest%nfft,nspinor1,Ftest%ngfft,ctmp) 682 fofr = REAL(ctmp) 683 ABI_FREE(ctmp) 684 else 685 ABI_ERROR(sjoin("Wrong cplex:", itoa(cplex))) 686 end if 687 end if 688 689 ABI_CALLOC(results, (Ftest%nfft)) 690 call cwtime(cpu_time, wall_time, gflops, "start") 691 692 do icall=1,NCALLS_FOR_TEST 693 ifft = empty_cache(CACHE_KBSIZE) 694 call fourdp(cplex,fofg,fofr,isign,Ftest%MPI_enreg,Ftest%nfft,1,Ftest%ngfft,0) 695 696 ! Store results at the first call. 697 if (icall==1) then 698 if (isign==1) then 699 if (cplex==1) then 700 results = DCMPLX(fofr, zero) 701 else if (cplex==2) then 702 results = DCMPLX( fofr(1:2*Ftest%nfft:2), fofr(2:2*Ftest%nfft:2) ) 703 end if 704 else if (isign==-1) then 705 results = DCMPLX(fofg(1,:),fofg(2,:)) 706 end if 707 end if 708 end do 709 710 call cwtime(cpu_time, wall_time, gflops, "stop") 711 call Ftprof%init(test_name,Ftest%nthreads,NCALLS_FOR_TEST,Ftest%ndat,ftest%use_gpu, & 712 cpu_time,wall_time,gflops,results=results) 713 714 ABI_FREE(fofg) 715 ABI_FREE(fofr) 716 ABI_FREE(results) 717 718 end subroutine time_fourdp
m_FFT_prof/time_fourwf [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
time_fourwf
FUNCTION
Profiling of the fourwf routine.
INPUTS
OUTPUT
SOURCE
856 subroutine time_fourwf(Ftest, cplex, option_fourwf, header, Ftprof) 857 858 !Arguments ----------------------------------- 859 class(FFT_test_t),intent(inout) :: Ftest 860 integer,intent(in) :: cplex,option_fourwf 861 character(len=500),intent(out) :: header 862 class(FFT_prof_t),intent(out) :: Ftprof 863 864 !Local variables------------------------------- 865 !scalars 866 integer,parameter :: tim0=0 867 integer :: n1,n2,n3,n4,n5,n6,npw_out,icall,i1,i2,i3,idx,ipw,ndat,cnt,dat,padat 868 integer :: fftalg,fftalga,fftalgc 869 real(dp),parameter :: weight_i=one,weight_r=one 870 real(dp) :: cpu_time,wall_time,gflops,gsq,g0dotr 871 logical :: isbuggy,not_supported 872 !character(len=500) :: msg 873 character(len=TNAME_LEN) :: test_name 874 !arrays 875 integer,parameter :: g0(3)=[1,-1,2] !g0(3)=[1,0,0] 876 integer :: gg(3) 877 integer,allocatable :: gbound_in(:,:),gbound_out(:,:) 878 real(dp),allocatable :: denpot(:,:,:),fofg_in(:,:), fofr_4(:,:,:,:),fofg_out(:,:) 879 complex(dpc),allocatable :: results(:) 880 881 ! ********************************************************************* 882 883 test_name = Ftest%get_name() 884 885 fftalg = Ftest%ngfft(7); fftalga = fftalg/100; fftalgc = MOD(fftalg,10) 886 887 isbuggy = & 888 (option_fourwf==3 .and. fftalga==FFT_SG2002 .and. fftalgc /= 0 .and. & 889 any(Ftest%istwf_k == [3,4,5,6,7,8,9])) ! see sg_fourwf 890 !isbuggy = .False. 891 892 !FIXME problems with the unitary tests reference files! 893 not_supported = ( & 894 (fftalgc == 2 .and. option_fourwf==0 .and. Ftest%istwf_k > 2)) 895 !not_supported = .FALSE. 896 897 npw_out= Ftest%npw_kout 898 ndat = Ftest%ndat 899 n1=Ftest%ngfft(1); n2=Ftest%ngfft(2); n3=Ftest%ngfft(3) 900 n4=Ftest%ngfft(4); n5=Ftest%ngfft(5); n6=Ftest%ngfft(6) 901 902 write(header,'(4(a,i2))')" fourwf with option ",option_fourwf,", cplex ",cplex,", ndat ",ndat,", istwf_k ",Ftest%istwf_k 903 904 if (isbuggy .or. not_supported .or. Ftest%available==0) then 905 call Ftprof%init(test_name,0,0,0,0,zero,zero,zero) 906 RETURN 907 end if 908 909 call xomp_set_num_threads(Ftest%nthreads) 910 911 if (Ftest%ngfft(7)/100 == FFT_FFTW3) call fftw3_set_nthreads(Ftest%nthreads) 912 913 ! FFTW3 does not need gbound_in but oh well 914 ABI_MALLOC(gbound_in, (2*Ftest%mgfft+8,2)) 915 call sphereboundary(gbound_in,Ftest%istwf_k,Ftest%kg_k,Ftest%mgfft,Ftest%npw_k) 916 917 ABI_MALLOC(gbound_out, (2*Ftest%mgfft+8,2)) 918 call sphereboundary(gbound_out,Ftest%istwf_k,Ftest%kg_kout,Ftest%mgfft,Ftest%npw_kout) 919 920 ABI_CALLOC(denpot,(cplex*n4,n5,n6)) 921 ABI_CALLOC(fofg_in,(2,Ftest%npw_k*ndat)) 922 ABI_CALLOC(fofg_out,(2,npw_out*ndat)) 923 ABI_CALLOC(fofr_4,(2,n4,n5,n6*ndat)) 924 ABI_CALLOC(results, (Ftest%nfft*ndat)) 925 926 select case (option_fourwf) 927 case (0,1,2) 928 !! for option==0, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; 929 !! fofr(2,n4,n5,n6) contains the output Fourier Transform of fofgin; 930 !! no use of denpot, fofgout and npwout. 931 !! for option==1, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; 932 !! denpot(cplex*n4,n5,n6) contains the input density at input, 933 !! and the updated density at output (accumulated); 934 !! no use of fofgout and npwout. 935 !! for option==2, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; 936 !! denpot(cplex*n4,n5,n6) contains the input local potential; 937 !! fofgout(2,npwout*ndat) contains the output function; 938 !! 939 do cnt=0,(Ftest%npw_k * ndat) - 1 940 ipw = 1 + MOD(cnt, Ftest%npw_k) 941 gg = Ftest%kg_k(:,ipw) 942 gsq = two_pi**2 * DOT_PRODUCT(gg,MATMUL(Ftest%gmet,gg)) 943 fofg_in(1,cnt+1) = EXP(-gsq) 944 fofg_in(2,cnt+1) = zero 945 end do 946 947 if (option_fourwf==1) then ! Init denpot 948 denpot = one 949 end if 950 ! 951 if (option_fourwf==2) then ! Init denpot 952 ! 953 if (cplex==1) then 954 do i3=0,n3-1 955 do i2=0,n2-1 956 do i1=0,n1-1 957 g0dotr= two_pi*( g0(1)*(i1/DBLE(n1)) & 958 & +g0(2)*(i2/DBLE(n2)) & 959 & +g0(3)*(i3/DBLE(n3)) ) 960 denpot(i1+1,i2+1,i3+1)=COS(g0dotr) 961 end do 962 end do 963 end do 964 else if (cplex==2) then 965 do i3=0,n3-1 966 do i2=0,n2-1 967 idx=1 968 do i1=0,n1-1 969 g0dotr= two_pi*( g0(1)*(i1/DBLE(n1)) & 970 & +g0(2)*(i2/DBLE(n2)) & 971 & +g0(3)*(i3/DBLE(n3)) ) 972 973 denpot(idx, i2+1,i3+1)= COS(g0dotr) 974 denpot(idx+1,i2+1,i3+1)= SIN(g0dotr) 975 idx=idx+2 976 end do 977 end do 978 end do 979 end if 980 end if 981 982 case (3) 983 !! for option==3, fofr(2,n4,n5,n6*ndat) contains the input real space wavefunction; 984 !! fofgout(2,npwout*ndat) contains its output Fourier transform; 985 !! no use of fofgin and npwin. 986 idx=0 987 do dat=1,ndat 988 padat = (dat-1)*n6 989 do i3=0,n3-1 990 do i2=0,n2-1 991 do i1=0,n1-1 992 g0dotr= two_pi*( g0(1)*(i1/DBLE(n1)) & 993 +g0(2)*(i2/DBLE(n2)) & 994 +g0(3)*(i3/DBLE(n3)) ) 995 !fofr_4(1,i1+1,i2+1,i3+1+padat)=COS(g0dotr) 996 !fofr_4(2,i1+1,i2+1,i3+1+padat)=SIN(g0dotr) 997 fofr_4(1,i1+1,i2+1,i3+1+padat)=10 * EXP(-g0dotr**2) 998 fofr_4(2,i1+1,i2+1,i3+1+padat)=zero 999 idx=idx+1 1000 results(idx) = DCMPLX(fofr_4(1,i1+1,i2+1,i3+1+padat), fofr_4(2,i1+1,i2+1,i3+1+padat)) 1001 end do 1002 end do 1003 end do 1004 end do 1005 1006 case default 1007 ABI_ERROR(sjoin("Wrong value for option_fourwf:", itoa(option_fourwf))) 1008 end select 1009 1010 call cwtime(cpu_time, wall_time, gflops, "start") 1011 1012 do icall=1,NCALLS_FOR_TEST 1013 1014 i1 = empty_cache(CACHE_KBSIZE) 1015 1016 call fourwf(cplex,denpot,fofg_in,fofg_out,fofr_4,gbound_in,gbound_out,Ftest%istwf_k,& 1017 Ftest%kg_k,Ftest%kg_kout,Ftest%mgfft,Ftest%MPI_enreg,ndat,Ftest%ngfft,Ftest%npw_k,npw_out,n4,n5,n6,option_fourwf,& 1018 tim0,weight_r,weight_i, use_gpu_cuda=ftest%use_gpu) 1019 1020 ! Store results at the first call. 1021 if (icall == 1) then 1022 select case (option_fourwf) 1023 case (0) 1024 !! for option==0, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; 1025 !! fofr(2,n4,n5,n6) contains the output Fourier Transform of fofgin; 1026 !! no use of denpot, fofgout and npwout. 1027 idx=0 1028 do dat=1,ndat 1029 padat = (dat-1)*n6 1030 do i3=1,n3 1031 do i2=1,n2 1032 do i1=1,n1 1033 idx=idx+1 1034 results(idx) = DCMPLX(fofr_4(1,i1,i2,i3+padat),fofr_4(2,i1,i2,i3+padat)) 1035 end do 1036 end do 1037 end do 1038 end do 1039 case (1) 1040 !! for option==1, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; 1041 !! denpot(cplex*n4,n5,n6) contains the input density at input, 1042 !! and the updated density at output (accumulated); 1043 !! no use of fofgout and npwout. 1044 if (cplex==1) then 1045 idx=0 1046 do i3=1,n3 1047 do i2=1,n2 1048 do i1=1,n1 1049 idx=idx+1 1050 results(idx) = DCMPLX(denpot(i1,i2,i3),zero) 1051 end do 1052 end do 1053 end do 1054 1055 else if (cplex==2) then 1056 idx=0 1057 do i3=1,n3 1058 do i2=1,n2 1059 do i1=1,2*n1,2 1060 idx=idx+1 1061 results(idx) = DCMPLX(denpot(i1,i2,i3),denpot(i1+1,i2,i3)) 1062 end do 1063 end do 1064 end do 1065 else 1066 ABI_ERROR("Wrong cplex") 1067 end if 1068 1069 case (2) 1070 !! for option==2, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; 1071 !! denpot(cplex*n4,n5,n6) contains the input local potential; 1072 !! fofgout(2,npwout*ndat) contains the output function; 1073 do ipw=1,npw_out*ndat 1074 results(ipw) = DCMPLX(fofg_out(1,ipw),fofg_out(2,ipw)) 1075 end do 1076 1077 case (3) 1078 !! for option==3, fofr(2,n4,n5,n6*ndat) contains the input real space wavefunction; 1079 !! fofgout(2,npwout*ndat) contains its output Fourier transform; 1080 !! no use of fofgin and npwin. 1081 do ipw=1,npw_out*ndat 1082 results(ipw) = DCMPLX(fofg_out(1,ipw),fofg_out(2,ipw)) 1083 end do 1084 !write(Ftest%ngfft(7),*)"results opt 3 Ftest%ngfft(7)",Ftest%ngfft(7) 1085 !do dat=1,ndat 1086 ! do ipw=1,npw_out 1087 ! idx = ipw + (dat-1)*npw_out 1088 ! write(Ftest%ngfft(7),*)ipw,dat,results(idx) 1089 ! end do 1090 !end do 1091 end select 1092 end if 1093 end do 1094 1095 call cwtime(cpu_time, wall_time, gflops, "stop") 1096 call Ftprof%init(test_name,Ftest%nthreads,NCALLS_FOR_TEST,Ftest%ndat,ftest%use_gpu,& 1097 cpu_time,wall_time,gflops,results=results) 1098 1099 ABI_FREE(denpot) 1100 ABI_FREE(fofg_in) 1101 ABI_FREE(fofg_out) 1102 ABI_FREE(fofr_4) 1103 ABI_FREE(gbound_in) 1104 ABI_FREE(gbound_out) 1105 ABI_FREE(results) 1106 1107 end subroutine time_fourwf
m_FFT_prof/time_rhotwg [ Functions ]
[ Top ] [ m_FFT_prof ] [ Functions ]
NAME
time_rhotwg
FUNCTION
Profiling of the rho_tw_g routine.
INPUTS
OUTPUT
SOURCE
1153 subroutine time_rhotwg(Ftest, map2sphere, use_padfft, osc_npw, osc_gvec, header, Ftprof) 1154 1155 !Arguments ----------------------------------- 1156 !scalars 1157 class(FFT_test_t),intent(inout) :: Ftest 1158 integer,intent(in) :: map2sphere,use_padfft,osc_npw 1159 character(len=500),intent(out) :: header 1160 class(FFT_prof_t),intent(out) :: Ftprof 1161 !arrays 1162 integer,intent(in) :: osc_gvec(3,osc_npw) 1163 1164 !Local variables------------------------------- 1165 !scalars 1166 integer,parameter :: nspinor1=1,dim_rtwg1=1,istwfk1=1 1167 integer :: icall,ifft,itim1,itim2,nfft,dat,sprc,ptr,ndat, n1,n2,n3,n4,n5,n6 1168 real(dp) :: cpu_time,wall_time,gflops 1169 complex(dpc) :: ktabp1 = cone, ktabp2 = cone 1170 character(len=TNAME_LEN) :: test_name 1171 logical :: not_implemented 1172 type(MPI_type) :: MPI_enreg_seq 1173 !arrays 1174 !integer,parameter :: g1(3)=[1,1,2],g2(3)=[-1,-2,-1] 1175 integer,parameter :: g1(3)=[-1,0,0], g2(3)=[1,0,0] 1176 integer,allocatable :: gbound(:,:), ktabr1(:), ktabr2(:), igfftg0(:) 1177 real(dp),parameter :: spinrot1(4)=(/one,zero,zero,one/),spinrot2(4)=(/one,zero,zero,one/) 1178 logical,allocatable :: mask(:) 1179 complex(dpc),allocatable :: results(:) 1180 complex(gwpc),allocatable :: rhotwg(:), wfn1(:), wfn2(:) 1181 1182 ! ********************************************************************* 1183 1184 test_name = Ftest%get_name() 1185 1186 nfft = Ftest%nfft; ndat = Ftest%ndat 1187 n1=Ftest%ngfft(1); n2=Ftest%ngfft(2); n3=Ftest%ngfft(3) 1188 n4=Ftest%ngfft(4); n5=Ftest%ngfft(5); n6=Ftest%ngfft(6) 1189 1190 write(header,'(3(a,i2))')"rho_tw_g with use_padfft ",use_padfft,", map2sphere ",map2sphere,", ndat ",ndat 1191 1192 ! TODO: zero-pad not available with SG2001 routines. 1193 not_implemented = (use_padfft==1.and.Ftest%ngfft(7) == 412) 1194 1195 if (Ftest%available==0.or.not_implemented) then 1196 call Ftprof%init(test_name,0,0,0,0,zero,zero,zero) 1197 return 1198 end if 1199 1200 call xomp_set_num_threads(Ftest%nthreads) 1201 1202 if (Ftest%ngfft(7)/100 == FFT_FFTW3) call fftw3_set_nthreads(Ftest%nthreads) 1203 1204 call initmpi_seq(MPI_enreg_seq) 1205 call init_distribfft_seq(MPI_enreg_seq%distribfft,'c',n2,n3,'all') 1206 1207 itim1=1; itim2=1 1208 ABI_MALLOC(ktabr1,(nfft)) 1209 ABI_MALLOC(ktabr2,(nfft)) 1210 1211 do ifft=1,nfft 1212 ktabr1(ifft)= ifft 1213 ktabr2(ifft)= ifft 1214 end do 1215 1216 ABI_MALLOC(igfftg0,(osc_npw*map2sphere)) 1217 1218 if (map2sphere > 0) then 1219 ABI_MALLOC(mask,(osc_npw)) 1220 call kgindex(igfftg0,osc_gvec,mask,MPI_enreg_seq,Ftest%ngfft,osc_npw) 1221 ABI_CHECK(ALL(mask)," FFT parallelism not supported") 1222 ABI_FREE(mask) 1223 end if 1224 1225 ABI_MALLOC(gbound,(2*Ftest%mgfft+8,2*use_padfft)) 1226 if (use_padfft == 1) call sphereboundary(gbound,istwfk1,osc_gvec,Ftest%mgfft,osc_npw) 1227 1228 ABI_MALLOC(wfn1, (nfft*nspinor1*ndat)) 1229 ABI_MALLOC(wfn2, (nfft*nspinor1*ndat)) 1230 1231 do dat=1,ndat 1232 do sprc=1,nspinor1 1233 ptr = 1 + (sprc-1)*nfft + (dat-1)*nfft*nspinor1 1234 call calc_ceigr(g1,nfft,nspinor1,Ftest%ngfft,wfn1(ptr:)) 1235 call calc_ceigr(g2,nfft,nspinor1,Ftest%ngfft,wfn2(ptr:)) 1236 end do 1237 end do 1238 1239 ABI_MALLOC(rhotwg,(osc_npw*dim_rtwg1*ndat)) 1240 ABI_MALLOC(results,(osc_npw*dim_rtwg1*ndat)) 1241 1242 call cwtime(cpu_time, wall_time, gflops, "start") 1243 1244 do icall=1,NCALLS_FOR_TEST 1245 ifft = empty_cache(CACHE_KBSIZE) 1246 call rho_tw_g(nspinor1,osc_npw,nfft,ndat,Ftest%ngfft,map2sphere,use_padfft,igfftg0,gbound,& 1247 wfn1,itim1,ktabr1,ktabp1,spinrot1,& 1248 wfn2,itim2,ktabr2,ktabp2,spinrot2,& 1249 dim_rtwg1,rhotwg) 1250 ! Store results at the first call. 1251 if (icall==1) results = rhotwg 1252 end do 1253 1254 call cwtime(cpu_time, wall_time, gflops, "stop") 1255 call Ftprof%init(test_name,Ftest%nthreads,NCALLS_FOR_TEST,Ftest%ndat,ftest%use_gpu, & 1256 cpu_time,wall_time,gflops,results=results) 1257 1258 ABI_FREE(ktabr1) 1259 ABI_FREE(ktabr2) 1260 ABI_FREE(igfftg0) 1261 ABI_FREE(gbound) 1262 ABI_FREE(rhotwg) 1263 ABI_FREE(wfn1) 1264 ABI_FREE(wfn2) 1265 ABI_FREE(results) 1266 call destroy_mpi_enreg(MPI_enreg_seq) 1267 1268 end subroutine time_rhotwg