TABLE OF CONTENTS
ABINIT/lapackprof [ Programs ]
NAME
lapackprof
FUNCTION
Utility for profiling the (Sca)Lapack libraries supported by abinit.
COPYRIGHT
Copyright (C) 2004-2018 ABINIT group (MG) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
INPUTS
(main program)
OUTPUT
Timing analysis of the different libraries and algorithms.
NOTES
PARENTS
CHILDREN
abi_io_redirect,abimem_init,abinit_doctor,cg_zaxpy,cg_zcopy,cg_zgemm cg_zgemv,cwtime,destroy_mpi_enreg,herald,init_mpi_enreg,lower,projbd pw_orthon,random_number,sqmat_itranspose,test_xginv,wrtout,xgerc,xheevx xhpev,xmpi_bcast,xmpi_end,xmpi_init,xomp_set_num_threads,xomp_show_info
SOURCE
33 #if defined HAVE_CONFIG_H 34 #include "config.h" 35 #endif 36 37 #include "abi_common.h" 38 39 program lapackprof 40 41 use defs_basis 42 use defs_abitypes 43 use m_build_info 44 use m_abicore 45 use m_xmpi 46 use m_xomp 47 use m_errors 48 use m_hide_blas 49 use m_cgtools 50 use m_hide_lapack 51 52 use m_fstrings, only : lower, itoa, strcat 53 use m_specialmsg, only : specialmsg_getcount, herald 54 use m_time, only : cwtime 55 use m_io_tools, only : prompt 56 use m_numeric_tools, only : arth 57 use m_mpinfo, only : init_mpi_enreg, destroy_mpi_enreg 58 59 !This section has been created automatically by the script Abilint (TD). 60 !Do not modify the following lines by hand. 61 #undef ABI_FUNC 62 #define ABI_FUNC 'lapackprof' 63 !End of the abilint section 64 65 implicit none 66 67 !Local variables------------------------------- 68 !scalars 69 integer,parameter :: master=0,nspinor1=1 70 integer :: comm,npw,ierr,my_rank,ii,isz,jj,it,step,icall,nfound 71 integer :: istwfk=1,useoverlap,mcg,mgsc,band,g0,idx,iall,ortalgo 72 real(dp) :: ctime,wtime,gflops 73 logical :: do_check,test_ortho,test_copy,test_dotc,test_axpy,test_gemv !,test_gemm 74 character(len=24) :: skinds 75 character(len=500) :: header,routname 76 type(MPI_type) :: MPI_enreg 77 !arrays 78 integer,allocatable :: sizes(:) 79 real(dp) :: alpha(2),beta(2),dot(2) 80 real(dp),allocatable :: cg(:,:),gsc(:,:),ortho_check(:,:,:) 81 real(dp),allocatable :: cg1(:,:),cg2(:,:),cg3(:,:),ene(:),direc(:,:),scprod(:,:) 82 complex(dpc),allocatable :: zvec(:),zmat(:,:),wmat(:,:),zpmat(:),evec(:,:) 83 ! complex(spc),allocatable :: vec(:),mat(:,:) 84 type(latime_t) :: Tres 85 ! ========== INPUT FILE ============== 86 integer :: ncalls=50,nband=200,nsizes=20,nthreads=1 87 logical :: debug_mode=.FALSE. 88 character(len=500) :: tasks="all" 89 integer :: size_arth(2) = (/1000,2000/) 90 namelist /CONTROL/ tasks, debug_mode,nthreads 91 namelist /SYSTEM/ ncalls,nband,nsizes,size_arth 92 93 ! ************************************************************************* 94 95 call xmpi_init() 96 97 !Change communicator for I/O (mandatory!) 98 call abi_io_redirect(new_io_comm=xmpi_world) 99 100 !Initialize memory profiling if it is activated 101 !if a full abimem.mocc report is desired, set the argument of abimem_init to "2" instead of "0" 102 !note that abimem.mocc files can easily be multiple GB in size so don't use this option normally 103 #ifdef HAVE_MEM_PROFILING 104 call abimem_init(0) 105 #endif 106 107 call init_mpi_enreg(mpi_enreg) 108 109 comm = xmpi_world 110 my_rank = xmpi_comm_rank(comm) 111 112 call herald("LAPACKPROF",abinit_version,std_out) 113 114 if (my_rank == master) then 115 write(std_out,'(a)')" Tool for profiling and testing the (Sca)Lapack libraries used in ABINIT." 116 end if 117 118 !Read input file 119 read(std_in, NML=CONTROL) 120 read(std_in, NML=SYSTEM) 121 122 call xomp_set_num_threads(nthreads) 123 call xomp_show_info(std_out) 124 125 call lower(tasks) 126 127 !replace "+" with white spaces 128 do ii=1,LEN_TRIM(tasks) 129 if (tasks(ii:ii) == "+") tasks(ii:ii) = " " 130 end do 131 !%call str_replace_chars(tasks,"+"," ") 132 tasks = " "//TRIM(tasks)//"" 133 134 call xmpi_bcast(tasks,master,comm,ierr) 135 call xmpi_bcast(size_arth,master,comm,ierr) 136 call xmpi_bcast(nsizes,master,comm,ierr) 137 138 iall=INDEX (tasks,"all") 139 140 test_ortho = (iall>0 .or. INDEX(tasks," ortho")>0 ) 141 test_copy = (iall>0 .or. INDEX(tasks," copy")>0 ) 142 test_dotc = (iall>0 .or. INDEX(tasks," dotc")>0 ) 143 test_axpy = (iall>0 .or. INDEX(tasks," axpy")>0 ) 144 test_gemv = (iall>0 .or. INDEX(tasks," gemv")>0 ) 145 !test_gemm = (iall>0 .or. INDEX(tasks," gemm")>0 ) 146 147 ABI_MALLOC(sizes,(nsizes)) 148 sizes = arth(size_arth(1),size_arth(2),nsizes) 149 150 if (INDEX(tasks," projbd")>0) then 151 istwfk =1 152 useoverlap = 0 153 154 routname = "projbd" 155 header = strcat("BEGIN_BENCHMARK: ",routname) 156 write(std_out,'(a)')TRIM(header) 157 158 do isz=1,nsizes 159 npw = sizes(isz) 160 mcg = npw * nband 161 mgsc = mcg * useoverlap 162 ABI_MALLOC(cg,(2,mcg)) 163 call random_number(cg) 164 165 ABI_MALLOC(gsc,(2,mgsc)) 166 167 ABI_MALLOC(direc, (2,npw*nspinor1)) 168 direc = zero 169 170 ABI_MALLOC(scprod, (2,nband)) 171 172 call cwtime(ctime,wtime,gflops,"start") 173 174 call projbd(cg,direc,0,0,0,istwfk,mcg,mgsc,nband,npw,nspinor1,gsc,scprod,0,0,useoverlap,& 175 & mpi_enreg%me_g0,mpi_enreg%comm_fft) 176 177 call cwtime(ctime,wtime,gflops,"stop") 178 write(std_out,'(2(a,i0),2(a,f9.6))')" npw = ",npw,", nband = ",nband,", cpu_time = ",ctime,", wall_time = ",wtime 179 180 ABI_FREE(scprod) 181 ABI_FREE(direc) 182 183 ABI_FREE(cg) 184 ABI_FREE(gsc) 185 end do 186 187 header = strcat("END_BENCHMARK: ",routname) 188 write(std_out,'(a)')TRIM(header) 189 end if 190 191 if (test_ortho) then 192 istwfk =1 193 useoverlap = 0 194 195 do ortalgo=1,4,1 196 routname = strcat(" pw_orthon, ortalgo = ",itoa(ortalgo)) 197 header = strcat("BEGIN_BENCHMARK: ",routname) 198 write(std_out,'(a)')TRIM(header) 199 ! 200 do isz=1,nsizes 201 npw = sizes(isz) 202 mcg = npw * nband 203 mgsc = mcg * useoverlap 204 ABI_MALLOC(cg,(2,mcg)) 205 call random_number(cg) 206 207 ABI_MALLOC(gsc,(2,mgsc)) 208 209 if (istwfk/=1) then 210 do band=1,nband 211 g0 = 1 + (band-1)*npw 212 cg(2,g0) = zero 213 end do 214 end if 215 216 call cwtime(ctime,wtime,gflops,"start") 217 218 call pw_orthon(0,0,istwfk,mcg,mgsc,npw,nband,ortalgo,gsc,useoverlap,cg,& 219 & mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft) 220 221 call cwtime(ctime,wtime,gflops,"stop") 222 223 write(std_out,'(2(a,i0),2(a,f9.6))')" npw = ",npw,", nband = ",nband,", cpu_time = ",ctime,", wall_time = ",wtime 224 225 if (debug_mode) then 226 ABI_MALLOC(ortho_check,(2,nband,nband)) 227 if (istwfk/=1) then 228 do band=1,nband 229 g0 = 1 + (band-1)*npw 230 cg(:,g0) = half * cg(:,g0) 231 end do 232 end if 233 234 call cg_zgemm("C","N",npw,nband,nband,cg,cg,ortho_check) 235 236 if (istwfk/=1) ortho_check = two * ortho_check 237 238 do band=1,nband 239 ortho_check(1,band,band) = ortho_check(1,band,band) - one 240 end do 241 242 write(std_out,*)"MAX ABS error",MAXVAL( ABS(RESHAPE(ortho_check,(/2*nband*nband/)) )) 243 ABI_FREE(ortho_check) 244 end if 245 246 ABI_FREE(cg) 247 ABI_FREE(gsc) 248 end do 249 250 header = strcat("END_BENCHMARK: ",routname) 251 write(std_out,'(a)')TRIM(header) 252 end do ! ortalgo 253 end if 254 255 !copy 256 if (test_copy) then 257 ! 258 do step=1,2 259 ! 260 if (step==1) routname = " f90_zcopy" 261 if (step==2) routname = " cg_zcopy" 262 header = strcat("BEGIN_BENCHMARK: ",routname) 263 write(std_out,'(a)')TRIM(header) 264 ! 265 do isz=1,nsizes 266 npw = sizes(isz) 267 ABI_MALLOC(cg1,(2,npw)) 268 ABI_MALLOC(cg2,(2,npw)) 269 cg1 = zero 270 271 call cwtime(ctime,wtime,gflops,"start") 272 if (step==1) then 273 do ii=1,ncalls 274 !$OMP PARALLEL DO 275 do jj=1,npw 276 cg2(1,jj) = cg1(1,jj) 277 cg2(2,jj) = cg1(2,jj) 278 end do 279 end do 280 else 281 do ii=1,ncalls 282 call cg_zcopy(npw,cg1,cg2) 283 end do 284 end if 285 call cwtime(ctime,wtime,gflops,"stop") 286 write(std_out,'(a,i0,2(a,f9.6))')" npw = ",npw,", cpu_time = ",ctime,", wall_time = ",wtime 287 288 ABI_FREE(cg1) 289 ABI_FREE(cg2) 290 end do 291 292 header = strcat("END_BENCHMARK: ",routname) 293 write(std_out,'(a)')TRIM(header) 294 end do 295 end if 296 297 !zdotc 298 if (test_dotc) then 299 ! 300 do step=1,2 301 ! 302 if (step==1) routname = " f90_zdotc" 303 if (step==2) routname = " cg_zdotc" 304 header = strcat("BEGIN_BENCHMARK: ",routname) 305 write(std_out,'(a)')TRIM(header) 306 307 do isz=1,nsizes 308 npw = sizes(isz) 309 ABI_MALLOC(cg1,(2,npw)) 310 ABI_MALLOC(cg2,(2,npw)) 311 call random_number(cg1) 312 call random_number(cg2) 313 314 call cwtime(ctime,wtime,gflops,"start") 315 if (step==1) then 316 do jj=1,ncalls 317 dot = zero 318 !$OMP PARALLEL DO REDUCTION(+:dot) 319 do ii=1,npw 320 dot(1) = dot(1) + cg1(1,ii)*cg2(1,ii) + cg1(2,ii)*cg2(2,ii) 321 dot(2) = dot(2) + cg1(1,ii)*cg2(2,ii) - cg1(2,ii)*cg2(1,ii) 322 end do 323 end do 324 else 325 do ii=1,ncalls 326 dot = cg_zdotc(npw,cg1,cg2) 327 end do 328 end if 329 call cwtime(ctime,wtime,gflops,"stop") 330 write(std_out,'(a,i0,2(a,f9.6))')" npw = ",npw,", cpu_time = ",ctime,", wall_time = ",wtime 331 ABI_FREE(cg1) 332 ABI_FREE(cg2) 333 end do 334 335 header = strcat("END_BENCHMARK: ",routname) 336 write(std_out,'(a)')TRIM(header) 337 end do 338 end if 339 340 !axpy 341 if (test_axpy) then 342 ! alpha = (/one,zero/) 343 alpha = (/one,two/) 344 do step=1,2 345 ! 346 if (step==1) routname = " f90_axpy" 347 if (step==2) routname = " cg_axpy" 348 header = strcat("BEGIN_BENCHMARK: ",routname) 349 write(std_out,'(a)')TRIM(header) 350 351 do isz=1,nsizes 352 npw = sizes(isz) 353 ABI_MALLOC(cg1,(2,npw)) 354 ABI_MALLOC(cg2,(2,npw)) 355 cg2 = zero 356 357 do jj=1,npw 358 cg1(:,jj) = jj 359 end do 360 361 call cwtime(ctime,wtime,gflops,"start") 362 if (step==1) then 363 jj = 0 364 do icall=1,ncalls 365 ! call random_number(cg1) 366 call random_number(cg2(:,1:1)) 367 ! cg2 = zero 368 do ii=1,npw 369 jj = jj+1 370 cg2(1,ii) = alpha(1)*cg1(1,ii) - alpha(2)*cg1(2,ii) + cg2(1,ii) 371 cg2(2,ii) = alpha(1)*cg1(2,ii) + alpha(2)*cg1(1,ii) + cg2(2,ii) 372 end do 373 end do 374 else 375 do icall=1,ncalls 376 ! call random_number(cg1) 377 ! call random_number(cg2) 378 ! cg2 = zero 379 call random_number(cg2(:,1:1)) 380 call cg_zaxpy(npw,alpha,cg1,cg2) 381 end do 382 end if 383 call cwtime(ctime,wtime,gflops,"stop") 384 write(std_out,'(a,i0,2(a,f9.6))')" npw = ",npw,", cpu_time = ",ctime,", wall_time = ",wtime 385 ABI_FREE(cg1) 386 ABI_FREE(cg2) 387 end do 388 389 header = strcat("END_BENCHMARK: ",routname) 390 write(std_out,'(a)')TRIM(header) 391 end do 392 end if 393 394 !cg_zgemv 395 if (test_gemv) then 396 alpha = (/one,two/) 397 beta = (/zero,zero/) 398 do step=1,2 399 if (step==1) routname = " f90_zgemv" 400 if (step==2) routname = " cg_zgemv" 401 header = strcat("BEGIN_BENCHMARK: ",routname) 402 write(std_out,'(a)')TRIM(header) 403 404 do isz=1,nsizes 405 npw = sizes(isz) 406 ABI_MALLOC(cg1,(2,npw*npw)) 407 ABI_MALLOC(cg2,(2,npw)) 408 ABI_MALLOC(cg3,(2,npw)) 409 410 do jj=1,npw*npw 411 cg1(:,jj) = jj 412 end do 413 cg2 = one 414 cg3 = zero 415 416 call cwtime(ctime,wtime,gflops,"start") 417 if (step==1) then 418 ! do icall=1,ncalls 419 ! do jj=1,npw 420 ! ar=scprod(1,iband);ai=scprod(2,iband) 421 ! do ipw=1,npw_sp 422 ! cg_re=cg(1,index1+ipw) 423 ! cg_im=cg(2,index1+ipw) 424 ! direc(1,ipw)=direc(1,ipw)-ar*cg_re+ai*cg_im 425 ! direc(2,ipw)=direc(2,ipw)-ar*cg_im-ai*cg_re 426 ! end do 427 ! end do 428 ! end do 429 ! !cg3(1,ii) = alpha(1)*cg1(1,ii) - alpha(2)*cg1(2,ii) + cg2(1,ii) 430 ! !cg3(2,ii) = alpha(1)*cg1(2,ii) + alpha(2)*cg1(1,ii) + cg2(2,ii) 431 ! end do 432 ! end do 433 else 434 do icall=1,ncalls 435 call cg_zgemv("N",npw,npw,cg1,cg2,cg3) 436 end do 437 end if 438 call cwtime(ctime,wtime,gflops,"stop") 439 write(std_out,'(a,i0,2(a,f9.6))')" npw = ",npw,", cpu_time = ",ctime,", wall_time = ",wtime 440 441 ABI_FREE(cg1) 442 ABI_FREE(cg2) 443 ABI_FREE(cg3) 444 end do 445 446 header = strcat("END_BENCHMARK: ",routname) 447 write(std_out,'(a)')TRIM(header) 448 end do 449 end if 450 451 !itranspose 452 if (.FALSE.) then 453 ! if (.TRUE.) then 454 do step=1,2 455 if (step==1) routname = " f90_transpose" 456 if (step==2) routname = " mkl_transponse" 457 header = strcat("BEGIN_BENCHMARK: ",routname) 458 write(std_out,'(a)')TRIM(header) 459 460 do isz=1,nsizes 461 npw = sizes(isz) 462 ABI_MALLOC(zmat,(npw,npw)) 463 ABI_MALLOC(wmat,(npw,npw)) 464 do jj=1,npw 465 do ii=1,npw 466 zmat(ii,jj) = DCMPLX(ii,jj) 467 end do 468 end do 469 470 call cwtime(ctime,wtime,gflops,"start") 471 if (step==1) then 472 do icall=1,ncalls 473 ! wmat = TRANSPOSE(zmat) 474 zmat = TRANSPOSE(zmat) 475 end do 476 else 477 do icall=1,ncalls 478 ! call sqmat_otranspose(npw,zmat,wmat) 479 call sqmat_itranspose(npw,zmat) 480 end do 481 end if 482 call cwtime(ctime,wtime,gflops,"stop") 483 write(std_out,'(a,i0,2(a,f9.6))')"npw = ",npw,", cpu_time ",ctime,", wall_time ",wtime 484 485 ABI_FREE(zmat) 486 ABI_FREE(wmat) 487 end do 488 489 header = strcat("END_BENCHMARK: ",routname) 490 write(std_out,'(a)')TRIM(header) 491 end do 492 end if 493 494 !zgerc 495 if (.FALSE.) then 496 do step=1,2 497 if (step==1) routname = " f90_zgerc" 498 if (step==2) routname = " zgerc" 499 header = strcat("BEGIN_BENCHMARK: ",routname) 500 write(std_out,'(a)')TRIM(header) 501 502 do isz=1,nsizes 503 npw = sizes(isz) 504 ABI_MALLOC(zvec,(npw)) 505 ABI_MALLOC(zmat,(npw,npw)) 506 zvec = cone; zmat = czero 507 508 call cwtime(ctime,wtime,gflops,"start") 509 if (step==1) then ! Home made zgerc 510 do it=1,ncalls 511 zmat = czero 512 !$OMP PARALLEL DO 513 do jj=1,npw 514 do ii=1,npw 515 zmat(ii,jj) = zmat(ii,jj) + CONJG(zvec(ii)) * zvec(jj) 516 end do 517 end do 518 end do 519 else 520 do jj=1,ncalls 521 zmat = czero 522 call XGERC(npw,npw,(1._dp,0._dp),zvec,1,zvec,1,zmat,npw) 523 end do 524 end if 525 call cwtime(ctime,wtime,gflops,"stop") 526 write(std_out,'(a,i0,2(a,f9.6))')" npw = ",npw,", cpu_time ",ctime,", wall_time ",wtime 527 528 ABI_FREE(zvec) 529 ABI_FREE(zmat) 530 end do 531 532 header = strcat("END_BENCHMARK: ",routname) 533 write(std_out,'(a)')TRIM(header) 534 end do 535 536 ! do isz=1,nsizes 537 ! npw = sizes(isz) 538 ! ABI_MALLOC(vec,(npw)) 539 ! ABI_MALLOC(mat,(npw,npw)) 540 ! vec = cone; mat = czero 541 542 ! call cwtime(ctime,wtime,gflops,"start") 543 544 ! do jj=1,ncalls 545 ! call XGERC(npw,npw,(1._sp,0._sp),vec,1,vec,1,mat,npw) 546 ! end do 547 548 ! call cwtime(ctime,wtime,gflops,"stop") 549 550 ! write(std_out,'(a,i0,2f9.3)')" CGERG size, cpu_time, wall_time, max_abserr ",npw,ctime,wtime 551 552 ! ABI_FREE(vec) 553 ! ABI_FREE(mat) 554 ! end do 555 end if 556 557 !xginv 558 if (.FALSE.) then 559 do_check = .FALSE. 560 ! do_check = .TRUE. 561 do ii=1,nsizes 562 npw = sizes(ii) 563 call test_xginv(npw,skinds,do_check,Tres,comm) 564 565 if (my_rank==master) then 566 write(std_out,'(a,i0,3f9.3)')& 567 & " routine = xginv, size, cpu_time, wall_time, max_abserr ",Tres%msize,Tres%ctime,Tres%wtime,Tres%max_abserr 568 end if 569 end do 570 end if 571 572 !xhpev vs xheev 573 if (.FALSE.) then 574 do step=1,2 575 ! 576 do isz=1,nsizes 577 npw = sizes(isz) 578 ABI_MALLOC(ene,(npw)) 579 ABI_MALLOC(evec,(npw,npw)) 580 581 ABI_MALLOC(zpmat,(npw*(npw+1)/2)) 582 zpmat = czero 583 idx = 0 584 do jj=1,npw 585 do ii=1,jj 586 idx = idx + 1 587 zpmat(idx) = cone 588 end do 589 end do 590 591 if (step==1) then 592 call cwtime(ctime,wtime,gflops,"start") 593 call xhpev("V","U",npw,zpmat,ene,evec,npw) 594 else 595 ABI_MALLOC(zmat,(npw,npw)) 596 do jj=1,npw 597 do ii=1,jj 598 idx = ii + jj*(jj-1)/2 599 zmat(ii,jj) = zpmat(idx) 600 end do 601 end do 602 call cwtime(ctime,wtime,gflops,"start") 603 ! call xheev("V","U",npw,zmat,ene) 604 call xheevx("V","A","U",npw,zmat,zero,zero,1,1,zero,nfound,ene,evec,npw) 605 ABI_FREE(zmat) 606 end if 607 call cwtime(ctime,wtime,gflops,"stop") 608 write(std_out,'(a,i0,2(a,f9.6))')"EIG PROBLEM: size = ",npw,", cpu_time ",ctime,", wall_time ",wtime 609 end do 610 611 ABI_FREE(ene) 612 ABI_FREE(evec) 613 ABI_FREE(zpmat) 614 end do 615 end if 616 ! 617 !=============================== 618 !=== End of run, free memory === 619 !=============================== 620 call wrtout(std_out,ch10//" Analysis completed.","COLL") 621 622 ABI_FREE(sizes) 623 624 call destroy_mpi_enreg(MPI_enreg) 625 626 !Writes information on file about the memory before ending mpi module, if memory profiling is enabled 627 !ABI_ALLOCATE(nband, (2)) 628 call abinit_doctor("__ioprof") 629 630 call xmpi_end() 631 632 end program lapackprof